itrans8_dspr2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/inv_txfm_dspr2.h"
#include "vpx_dsp/txfm_common.h"

#if HAVE_DSPR2
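/* Row pass of the 8x8 inverse DCT.
 *
 * Each loop iteration transforms one row of 8 coefficients and writes the
 * eight results down one column of the intermediate buffer (the stores use a
 * 16-byte stride while `output` advances by a single int16 per row), so the
 * intermediate data comes out transposed, ready for the column pass.
 *
 * Rounding sketch (an illustration, assuming DCT_CONST_BITS == 14 from
 * vpx_dsp/txfm_common.h, with the DSPControl pos field already set to 45 by
 * the callers below): each accumulator is preloaded with
 * const_2_power_13 == 1 << 13, products are summed with madd/msub, and
 * "extp ..., 31" extracts the 32 bits ending at bit 14, which matches
 *
 *   dct_const_round_shift(x) == (x + (1 << 13)) >> 14
 */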
void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  const int const_2_power_13 = 8192;
  int Temp0, Temp1, Temp2, Temp3, Temp4;
  int i;

  for (i = no_rows; i--; ) {
    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0],             0(%[input])                     \n\t"
        "lh       %[Temp1],             8(%[input])                     \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
        "extp     %[Temp4],             $ac0,           31              \n\t"

        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "extp     %[Temp2],             $ac1,           31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             4(%[input])                     \n\t"
        "lh       %[Temp1],             12(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "extp     %[Temp3],             $ac0,           31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
        "extp     %[Temp1],             $ac1,           31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp1],             14(%[input])                    \n\t"
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
        "extp     %[step1_4],           $ac0,           31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
        "extp     %[step1_7],           $ac1,           31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
        "extp     %[step1_5],           $ac0,           31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"

        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

        "extp     %[step1_5],           $ac0,           31              \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

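        /*
          Final butterfly and store (a reference sketch; output is int16_t,
          so the byte offsets below step by 16 == 8 * sizeof(int16_t), i.e.
          one row of the 8x8 intermediate buffer per store):

            output[0 * 8] = step1_0 + step1_7;
            output[1 * 8] = step1_1 + step1_6;
            output[2 * 8] = step1_2 + step1_5;
            output[3 * 8] = step1_3 + step1_4;
            output[4 * 8] = step1_3 - step1_4;
            output[5 * 8] = step1_2 - step1_5;
            output[6 * 8] = step1_1 - step1_6;
            output[7 * 8] = step1_0 - step1_7;
        */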
        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "sh       %[Temp0],             0(%[output])                    \n\t"
        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
        "sh       %[Temp1],             16(%[output])                   \n\t"
        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "sh       %[Temp0],             32(%[output])                   \n\t"
        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
        "sh       %[Temp1],             48(%[output])                   \n\t"

        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "sh       %[Temp0],             64(%[output])                   \n\t"
        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
        "sh       %[Temp1],             80(%[output])                   \n\t"
        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "sh       %[Temp0],             96(%[output])                   \n\t"
        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
        "sh       %[Temp1],             112(%[output])                  \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [Temp4] "=&r" (Temp4)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [output] "r" (output), [input] "r" (input)
    );

    input += 8;
    output += 1;
  }
}

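/* Column pass of the 8x8 inverse DCT plus reconstruction.
 *
 * Each of the eight iterations reads one row of the transposed intermediate
 * buffer (one column of the row-pass results), applies the same idct8
 * butterfly as above, and then rounds, adds and clamps the result down
 * column i of dest.  A per-pixel sketch of the "add block" tail below, using
 * the ROUND_POWER_OF_TWO()/clip_pixel() notation from vpx_dsp (temp_out is
 * named here only for illustration):
 *
 *   dest[j * dest_stride + i] =
 *       clip_pixel(dest[j * dest_stride + i] +
 *                  ROUND_POWER_OF_TWO(temp_out[j], 5));
 *
 * The addi 16 / sra 5 pair implements ROUND_POWER_OF_TWO(x, 5), and the lbux
 * lookup into vpx_ff_cropTbl clamps out-of-range sums to the [0, 255] pixel
 * range.
 */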
void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                 int dest_stride) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int Temp0, Temp1, Temp2, Temp3;
  int i;
  const int const_2_power_13 = 8192;
  uint8_t *dest_pix;
  uint8_t *cm = vpx_ff_cropTbl;

  /* prefetch vpx_ff_cropTbl */
  prefetch_load(vpx_ff_cropTbl);
  prefetch_load(vpx_ff_cropTbl +  32);
  prefetch_load(vpx_ff_cropTbl +  64);
  prefetch_load(vpx_ff_cropTbl +  96);
  prefetch_load(vpx_ff_cropTbl + 128);
  prefetch_load(vpx_ff_cropTbl + 160);
  prefetch_load(vpx_ff_cropTbl + 192);
  prefetch_load(vpx_ff_cropTbl + 224);

  for (i = 0; i < 8; ++i) {
    dest_pix = (dest + i);

    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0],             0(%[input])                     \n\t"
        "lh       %[Temp1],             8(%[input])                     \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
        "extp     %[step1_6],           $ac0,           31              \n\t"

        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "extp     %[Temp2],             $ac1,           31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             4(%[input])                     \n\t"
        "lh       %[Temp1],             12(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "extp     %[Temp3],             $ac0,           31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
        "extp     %[Temp1],             $ac1,           31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp1],             14(%[input])                    \n\t"
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
        "extp     %[step1_4],           $ac0,           31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
        "extp     %[step1_7],           $ac1,           31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
        "extp     %[step1_5],           $ac0,           31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"

        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

        "extp     %[step1_5],           $ac0,           31              \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /* add block */
        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dest_pix] "+r" (dest_pix)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );

    input += 8;
  }
}

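/* Full 8x8 inverse DCT with reconstruction (all 64 coefficients may be
 * non-zero).  The wrdsp below programs the DSPControl pos field to 45, which
 * every extp in the row/column helpers above depends on; see the rounding
 * sketch before idct8_rows_dspr2(). */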
void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 8);

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

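/* 8x8 inverse DCT for nearly empty blocks (used when at most 12 coefficients
 * are non-zero): with the default scan order the non-zero coefficients fall
 * in the first four rows, so only those rows go through the row pass.
 * Because the row pass writes its results transposed, the missing rows
 * correspond to the last four entries of every intermediate row, which are
 * cleared below before the column pass. */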
void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 4);

  outptr += 4;

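  /* Zero the last four int16 entries of each intermediate row: outptr points
     at element 4 of row 0, each pair of sw stores clears 8 bytes, and the
     16-byte offsets step one row at a time. */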
  __asm__ __volatile__ (
      "sw  $zero,   0(%[outptr])  \n\t"
      "sw  $zero,   4(%[outptr])  \n\t"
      "sw  $zero,  16(%[outptr])  \n\t"
      "sw  $zero,  20(%[outptr])  \n\t"
      "sw  $zero,  32(%[outptr])  \n\t"
      "sw  $zero,  36(%[outptr])  \n\t"
      "sw  $zero,  48(%[outptr])  \n\t"
      "sw  $zero,  52(%[outptr])  \n\t"
      "sw  $zero,  64(%[outptr])  \n\t"
      "sw  $zero,  68(%[outptr])  \n\t"
      "sw  $zero,  80(%[outptr])  \n\t"
      "sw  $zero,  84(%[outptr])  \n\t"
      "sw  $zero,  96(%[outptr])  \n\t"
      "sw  $zero, 100(%[outptr])  \n\t"
      "sw  $zero, 112(%[outptr])  \n\t"
      "sw  $zero, 116(%[outptr])  \n\t"

      :
      : [outptr] "r" (outptr)
  );

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

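/* DC-only 8x8 path: reconstructs the block from input[0] alone.  A scalar
 * sketch of what the code below computes, using ROUND_POWER_OF_TWO()/
 * clip_pixel() notation for illustration:
 *
 *   a1 = ROUND_POWER_OF_TWO(
 *            dct_const_round_shift(
 *                dct_const_round_shift(input[0] * cospi_16_64) *
 *                cospi_16_64), 5);
 *   dest[r * dest_stride + c] = clip_pixel(dest[r * dest_stride + c] + a1);
 *
 * a1 is replicated into all four bytes of a register with replv.qb and
 * applied eight pixels per row with the saturating addu_s.qb / subu_s.qb
 * instructions. */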
void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__ (
      "addi     %[out],     %[out],     16      \n\t"
      "sra      %[a1],      %[out],     5       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

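  /* A negative DC offset is applied as a saturating subtract of |a1| so the
     per-byte arithmetic in subu_s.qb / addu_s.qb stays unsigned. */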
  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],       %[a1]       \n\t"
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}

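/* Plain C 8-point ADST (asymmetric DST), intended for the ADST/DCT hybrid
 * transform paths; this helper is not DSPr2-accelerated.  Each stage follows
 * the same pattern as the DCT above:
 *
 *   s_i = cospi_a * x_j +/- cospi_b * x_k;
 *   x_i = ROUND_POWER_OF_TWO(s_m +/- s_n, DCT_CONST_BITS);
 *
 * with DCT_CONST_BITS == 14 as defined in vpx_dsp/txfm_common.h. */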
void iadst8_dspr2(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  int x0, x1, x2, x3, x4, x5, x6, x7;

  x0 = input[7];
  x1 = input[0];
  x2 = input[5];
  x3 = input[2];
  x4 = input[3];
  x5 = input[4];
  x6 = input[1];
  x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);

  output[0] =  x0;
  output[1] = -x4;
  output[2] =  x6;
  output[3] = -x2;
  output[4] =  x3;
  output[5] = -x7;
  output[6] =  x5;
  output[7] = -x1;
}
#endif  // HAVE_DSPR2