1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <stdio.h> 13 14#include "./vpx_config.h" 15#include "vpx_dsp/mips/inv_txfm_dspr2.h" 16#include "vpx_dsp/txfm_common.h" 17 18#if HAVE_DSPR2 19static void idct32_rows_dspr2(const int16_t *input, int16_t *output, 20 uint32_t no_rows) { 21 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; 22 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; 23 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; 24 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; 25 int16_t step1_28, step1_29, step1_30, step1_31; 26 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 27 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; 28 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; 29 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; 30 int16_t step2_28, step2_29, step2_30, step2_31; 31 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; 32 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; 33 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; 34 int16_t step3_29, step3_30, step3_31; 35 int temp0, temp1, temp2, temp3; 36 int load1, load2, load3, load4; 37 int result1, result2; 38 int temp21; 39 int i; 40 const int const_2_power_13 = 8192; 41 const int32_t *input_int; 42 43 for (i = no_rows; i--; ) { 44 input_int = (const int32_t *)input; 45 46 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | 47 input_int[4] | input_int[5] | input_int[6] | input_int[7] | 48 input_int[8] | input_int[9] | input_int[10] | input_int[11] | 49 input_int[12] | input_int[13] | input_int[14] | input_int[15])) { 50 input += 32; 51 52 __asm__ __volatile__ ( 53 "sh $zero, 0(%[output]) \n\t" 54 "sh $zero, 64(%[output]) \n\t" 55 "sh $zero, 128(%[output]) \n\t" 56 "sh $zero, 192(%[output]) \n\t" 57 "sh $zero, 256(%[output]) \n\t" 58 "sh $zero, 320(%[output]) \n\t" 59 "sh $zero, 384(%[output]) \n\t" 60 "sh $zero, 448(%[output]) \n\t" 61 "sh $zero, 512(%[output]) \n\t" 62 "sh $zero, 576(%[output]) \n\t" 63 "sh $zero, 640(%[output]) \n\t" 64 "sh $zero, 704(%[output]) \n\t" 65 "sh $zero, 768(%[output]) \n\t" 66 "sh $zero, 832(%[output]) \n\t" 67 "sh $zero, 896(%[output]) \n\t" 68 "sh $zero, 960(%[output]) \n\t" 69 "sh $zero, 1024(%[output]) \n\t" 70 "sh $zero, 1088(%[output]) \n\t" 71 "sh $zero, 1152(%[output]) \n\t" 72 "sh $zero, 1216(%[output]) \n\t" 73 "sh $zero, 1280(%[output]) \n\t" 74 "sh $zero, 1344(%[output]) \n\t" 75 "sh $zero, 1408(%[output]) \n\t" 76 "sh $zero, 1472(%[output]) \n\t" 77 "sh $zero, 1536(%[output]) \n\t" 78 "sh $zero, 1600(%[output]) \n\t" 79 "sh $zero, 1664(%[output]) \n\t" 80 "sh $zero, 1728(%[output]) \n\t" 81 "sh $zero, 1792(%[output]) \n\t" 82 "sh $zero, 1856(%[output]) \n\t" 83 "sh $zero, 1920(%[output]) \n\t" 84 "sh $zero, 1984(%[output]) \n\t" 85 86 : 87 : [output] "r" (output) 88 ); 89 90 output += 1; 91 92 continue; 93 } 94 95 /* prefetch row */ 96 prefetch_load((const uint8_t *)(input + 32)); 97 prefetch_load((const uint8_t *)(input + 48)); 98 99 __asm__ __volatile__ ( 100 "lh %[load1], 2(%[input]) \n\t" 101 "lh %[load2], 62(%[input]) \n\t" 102 "lh %[load3], 34(%[input]) \n\t" 103 "lh %[load4], 30(%[input]) \n\t" 104 105 "mtlo %[const_2_power_13], $ac1 \n\t" 106 "mthi $zero, $ac1 \n\t" 107 "mtlo %[const_2_power_13], $ac3 \n\t" 108 "mthi $zero, $ac3 \n\t" 109 110 "madd $ac1, %[load1], %[cospi_31_64] \n\t" 111 "msub $ac1, %[load2], %[cospi_1_64] \n\t" 112 "extp %[temp0], $ac1, 31 \n\t" 113 114 "madd $ac3, %[load1], %[cospi_1_64] \n\t" 115 "madd $ac3, %[load2], %[cospi_31_64] \n\t" 116 "extp %[temp3], $ac3, 31 \n\t" 117 118 "mtlo %[const_2_power_13], $ac1 \n\t" 119 "mthi $zero, $ac1 \n\t" 120 "mtlo %[const_2_power_13], $ac2 \n\t" 121 "mthi $zero, $ac2 \n\t" 122 123 "madd $ac2, %[load3], %[cospi_15_64] \n\t" 124 "msub $ac2, %[load4], %[cospi_17_64] \n\t" 125 "extp %[temp1], $ac2, 31 \n\t" 126 127 "madd $ac1, %[load3], %[cospi_17_64] \n\t" 128 "madd $ac1, %[load4], %[cospi_15_64] \n\t" 129 "extp %[temp2], $ac1, 31 \n\t" 130 131 "mtlo %[const_2_power_13], $ac1 \n\t" 132 "mthi $zero, $ac1 \n\t" 133 "mtlo %[const_2_power_13], $ac3 \n\t" 134 "mthi $zero, $ac3 \n\t" 135 136 "sub %[load1], %[temp3], %[temp2] \n\t" 137 "sub %[load2], %[temp0], %[temp1] \n\t" 138 139 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 140 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 141 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 142 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 143 144 "extp %[step1_17], $ac1, 31 \n\t" 145 "extp %[step1_30], $ac3, 31 \n\t" 146 "add %[step1_16], %[temp0], %[temp1] \n\t" 147 "add %[step1_31], %[temp2], %[temp3] \n\t" 148 149 : [load1] "=&r" (load1), [load2] "=&r" (load2), 150 [load3] "=&r" (load3), [load4] "=&r" (load4), 151 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 152 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 153 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), 154 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) 155 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 156 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), 157 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), 158 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) 159 ); 160 161 __asm__ __volatile__ ( 162 "lh %[load1], 18(%[input]) \n\t" 163 "lh %[load2], 46(%[input]) \n\t" 164 "lh %[load3], 50(%[input]) \n\t" 165 "lh %[load4], 14(%[input]) \n\t" 166 167 "mtlo %[const_2_power_13], $ac1 \n\t" 168 "mthi $zero, $ac1 \n\t" 169 "mtlo %[const_2_power_13], $ac3 \n\t" 170 "mthi $zero, $ac3 \n\t" 171 172 "madd $ac1, %[load1], %[cospi_23_64] \n\t" 173 "msub $ac1, %[load2], %[cospi_9_64] \n\t" 174 "extp %[temp0], $ac1, 31 \n\t" 175 176 "madd $ac3, %[load1], %[cospi_9_64] \n\t" 177 "madd $ac3, %[load2], %[cospi_23_64] \n\t" 178 "extp %[temp3], $ac3, 31 \n\t" 179 180 "mtlo %[const_2_power_13], $ac1 \n\t" 181 "mthi $zero, $ac1 \n\t" 182 "mtlo %[const_2_power_13], $ac2 \n\t" 183 "mthi $zero, $ac2 \n\t" 184 185 "madd $ac2, %[load3], %[cospi_7_64] \n\t" 186 "msub $ac2, %[load4], %[cospi_25_64] \n\t" 187 "extp %[temp1], $ac2, 31 \n\t" 188 189 "madd $ac1, %[load3], %[cospi_25_64] \n\t" 190 "madd $ac1, %[load4], %[cospi_7_64] \n\t" 191 "extp %[temp2], $ac1, 31 \n\t" 192 193 "mtlo %[const_2_power_13], $ac1 \n\t" 194 "mthi $zero, $ac1 \n\t" 195 "mtlo %[const_2_power_13], $ac3 \n\t" 196 "mthi $zero, $ac3 \n\t" 197 198 "sub %[load1], %[temp1], %[temp0] \n\t" 199 "sub %[load2], %[temp2], %[temp3] \n\t" 200 201 "msub $ac1, %[load1], %[cospi_28_64] \n\t" 202 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 203 "msub $ac3, %[load1], %[cospi_4_64] \n\t" 204 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 205 206 "extp %[step1_18], $ac1, 31 \n\t" 207 "extp %[step1_29], $ac3, 31 \n\t" 208 "add %[step1_19], %[temp0], %[temp1] \n\t" 209 "add %[step1_28], %[temp2], %[temp3] \n\t" 210 211 : [load1] "=&r" (load1), [load2] "=&r" (load2), 212 [load3] "=&r" (load3), [load4] "=&r" (load4), 213 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 214 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 215 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), 216 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) 217 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 218 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), 219 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), 220 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) 221 ); 222 223 __asm__ __volatile__ ( 224 "lh %[load1], 10(%[input]) \n\t" 225 "lh %[load2], 54(%[input]) \n\t" 226 "lh %[load3], 42(%[input]) \n\t" 227 "lh %[load4], 22(%[input]) \n\t" 228 229 "mtlo %[const_2_power_13], $ac1 \n\t" 230 "mthi $zero, $ac1 \n\t" 231 "mtlo %[const_2_power_13], $ac3 \n\t" 232 "mthi $zero, $ac3 \n\t" 233 234 "madd $ac1, %[load1], %[cospi_27_64] \n\t" 235 "msub $ac1, %[load2], %[cospi_5_64] \n\t" 236 "extp %[temp0], $ac1, 31 \n\t" 237 238 "madd $ac3, %[load1], %[cospi_5_64] \n\t" 239 "madd $ac3, %[load2], %[cospi_27_64] \n\t" 240 "extp %[temp3], $ac3, 31 \n\t" 241 242 "mtlo %[const_2_power_13], $ac1 \n\t" 243 "mthi $zero, $ac1 \n\t" 244 "mtlo %[const_2_power_13], $ac2 \n\t" 245 "mthi $zero, $ac2 \n\t" 246 247 "madd $ac2, %[load3], %[cospi_11_64] \n\t" 248 "msub $ac2, %[load4], %[cospi_21_64] \n\t" 249 "extp %[temp1], $ac2, 31 \n\t" 250 251 "madd $ac1, %[load3], %[cospi_21_64] \n\t" 252 "madd $ac1, %[load4], %[cospi_11_64] \n\t" 253 "extp %[temp2], $ac1, 31 \n\t" 254 255 "mtlo %[const_2_power_13], $ac1 \n\t" 256 "mthi $zero, $ac1 \n\t" 257 "mtlo %[const_2_power_13], $ac3 \n\t" 258 "mthi $zero, $ac3 \n\t" 259 260 "sub %[load1], %[temp0], %[temp1] \n\t" 261 "sub %[load2], %[temp3], %[temp2] \n\t" 262 263 "madd $ac1, %[load2], %[cospi_12_64] \n\t" 264 "msub $ac1, %[load1], %[cospi_20_64] \n\t" 265 "madd $ac3, %[load1], %[cospi_12_64] \n\t" 266 "madd $ac3, %[load2], %[cospi_20_64] \n\t" 267 268 "extp %[step1_21], $ac1, 31 \n\t" 269 "extp %[step1_26], $ac3, 31 \n\t" 270 "add %[step1_20], %[temp0], %[temp1] \n\t" 271 "add %[step1_27], %[temp2], %[temp3] \n\t" 272 273 : [load1] "=&r" (load1), [load2] "=&r" (load2), 274 [load3] "=&r" (load3), [load4] "=&r" (load4), 275 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 276 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 277 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), 278 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) 279 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 280 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), 281 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), 282 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) 283 ); 284 285 __asm__ __volatile__ ( 286 "lh %[load1], 26(%[input]) \n\t" 287 "lh %[load2], 38(%[input]) \n\t" 288 "lh %[load3], 58(%[input]) \n\t" 289 "lh %[load4], 6(%[input]) \n\t" 290 291 "mtlo %[const_2_power_13], $ac1 \n\t" 292 "mthi $zero, $ac1 \n\t" 293 "mtlo %[const_2_power_13], $ac3 \n\t" 294 "mthi $zero, $ac3 \n\t" 295 296 "madd $ac1, %[load1], %[cospi_19_64] \n\t" 297 "msub $ac1, %[load2], %[cospi_13_64] \n\t" 298 "extp %[temp0], $ac1, 31 \n\t" 299 300 "madd $ac3, %[load1], %[cospi_13_64] \n\t" 301 "madd $ac3, %[load2], %[cospi_19_64] \n\t" 302 "extp %[temp3], $ac3, 31 \n\t" 303 304 "mtlo %[const_2_power_13], $ac1 \n\t" 305 "mthi $zero, $ac1 \n\t" 306 "mtlo %[const_2_power_13], $ac2 \n\t" 307 "mthi $zero, $ac2 \n\t" 308 309 "madd $ac2, %[load3], %[cospi_3_64] \n\t" 310 "msub $ac2, %[load4], %[cospi_29_64] \n\t" 311 "extp %[temp1], $ac2, 31 \n\t" 312 313 "madd $ac1, %[load3], %[cospi_29_64] \n\t" 314 "madd $ac1, %[load4], %[cospi_3_64] \n\t" 315 "extp %[temp2], $ac1, 31 \n\t" 316 317 "mtlo %[const_2_power_13], $ac1 \n\t" 318 "mthi $zero, $ac1 \n\t" 319 "mtlo %[const_2_power_13], $ac3 \n\t" 320 "mthi $zero, $ac3 \n\t" 321 322 "sub %[load1], %[temp1], %[temp0] \n\t" 323 "sub %[load2], %[temp2], %[temp3] \n\t" 324 325 "msub $ac1, %[load1], %[cospi_12_64] \n\t" 326 "msub $ac1, %[load2], %[cospi_20_64] \n\t" 327 "msub $ac3, %[load1], %[cospi_20_64] \n\t" 328 "madd $ac3, %[load2], %[cospi_12_64] \n\t" 329 330 "extp %[step1_22], $ac1, 31 \n\t" 331 "extp %[step1_25], $ac3, 31 \n\t" 332 "add %[step1_23], %[temp0], %[temp1] \n\t" 333 "add %[step1_24], %[temp2], %[temp3] \n\t" 334 335 : [load1] "=&r" (load1), [load2] "=&r" (load2), 336 [load3] "=&r" (load3), [load4] "=&r" (load4), 337 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 338 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 339 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), 340 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) 341 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 342 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), 343 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), 344 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) 345 ); 346 347 __asm__ __volatile__ ( 348 "lh %[load1], 4(%[input]) \n\t" 349 "lh %[load2], 60(%[input]) \n\t" 350 "lh %[load3], 36(%[input]) \n\t" 351 "lh %[load4], 28(%[input]) \n\t" 352 353 "mtlo %[const_2_power_13], $ac1 \n\t" 354 "mthi $zero, $ac1 \n\t" 355 "mtlo %[const_2_power_13], $ac3 \n\t" 356 "mthi $zero, $ac3 \n\t" 357 358 "madd $ac1, %[load1], %[cospi_30_64] \n\t" 359 "msub $ac1, %[load2], %[cospi_2_64] \n\t" 360 "extp %[temp0], $ac1, 31 \n\t" 361 362 "madd $ac3, %[load1], %[cospi_2_64] \n\t" 363 "madd $ac3, %[load2], %[cospi_30_64] \n\t" 364 "extp %[temp3], $ac3, 31 \n\t" 365 366 "mtlo %[const_2_power_13], $ac1 \n\t" 367 "mthi $zero, $ac1 \n\t" 368 "mtlo %[const_2_power_13], $ac2 \n\t" 369 "mthi $zero, $ac2 \n\t" 370 371 "madd $ac2, %[load3], %[cospi_14_64] \n\t" 372 "msub $ac2, %[load4], %[cospi_18_64] \n\t" 373 "extp %[temp1], $ac2, 31 \n\t" 374 375 "madd $ac1, %[load3], %[cospi_18_64] \n\t" 376 "madd $ac1, %[load4], %[cospi_14_64] \n\t" 377 "extp %[temp2], $ac1, 31 \n\t" 378 379 "mtlo %[const_2_power_13], $ac1 \n\t" 380 "mthi $zero, $ac1 \n\t" 381 "mtlo %[const_2_power_13], $ac3 \n\t" 382 "mthi $zero, $ac3 \n\t" 383 384 "sub %[load1], %[temp0], %[temp1] \n\t" 385 "sub %[load2], %[temp3], %[temp2] \n\t" 386 387 "msub $ac1, %[load1], %[cospi_8_64] \n\t" 388 "madd $ac1, %[load2], %[cospi_24_64] \n\t" 389 "madd $ac3, %[load1], %[cospi_24_64] \n\t" 390 "madd $ac3, %[load2], %[cospi_8_64] \n\t" 391 392 "extp %[step2_9], $ac1, 31 \n\t" 393 "extp %[step2_14], $ac3, 31 \n\t" 394 "add %[step2_8], %[temp0], %[temp1] \n\t" 395 "add %[step2_15], %[temp2], %[temp3] \n\t" 396 397 : [load1] "=&r" (load1), [load2] "=&r" (load2), 398 [load3] "=&r" (load3), [load4] "=&r" (load4), 399 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 400 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 401 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), 402 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) 403 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 404 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), 405 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), 406 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) 407 ); 408 409 __asm__ __volatile__ ( 410 "lh %[load1], 20(%[input]) \n\t" 411 "lh %[load2], 44(%[input]) \n\t" 412 "lh %[load3], 52(%[input]) \n\t" 413 "lh %[load4], 12(%[input]) \n\t" 414 415 "mtlo %[const_2_power_13], $ac1 \n\t" 416 "mthi $zero, $ac1 \n\t" 417 "mtlo %[const_2_power_13], $ac3 \n\t" 418 "mthi $zero, $ac3 \n\t" 419 420 "madd $ac1, %[load1], %[cospi_22_64] \n\t" 421 "msub $ac1, %[load2], %[cospi_10_64] \n\t" 422 "extp %[temp0], $ac1, 31 \n\t" 423 424 "madd $ac3, %[load1], %[cospi_10_64] \n\t" 425 "madd $ac3, %[load2], %[cospi_22_64] \n\t" 426 "extp %[temp3], $ac3, 31 \n\t" 427 428 "mtlo %[const_2_power_13], $ac1 \n\t" 429 "mthi $zero, $ac1 \n\t" 430 "mtlo %[const_2_power_13], $ac2 \n\t" 431 "mthi $zero, $ac2 \n\t" 432 433 "madd $ac2, %[load3], %[cospi_6_64] \n\t" 434 "msub $ac2, %[load4], %[cospi_26_64] \n\t" 435 "extp %[temp1], $ac2, 31 \n\t" 436 437 "madd $ac1, %[load3], %[cospi_26_64] \n\t" 438 "madd $ac1, %[load4], %[cospi_6_64] \n\t" 439 "extp %[temp2], $ac1, 31 \n\t" 440 441 "mtlo %[const_2_power_13], $ac1 \n\t" 442 "mthi $zero, $ac1 \n\t" 443 "mtlo %[const_2_power_13], $ac3 \n\t" 444 "mthi $zero, $ac3 \n\t" 445 446 "sub %[load1], %[temp1], %[temp0] \n\t" 447 "sub %[load2], %[temp2], %[temp3] \n\t" 448 449 "msub $ac1, %[load1], %[cospi_24_64] \n\t" 450 "msub $ac1, %[load2], %[cospi_8_64] \n\t" 451 "madd $ac3, %[load2], %[cospi_24_64] \n\t" 452 "msub $ac3, %[load1], %[cospi_8_64] \n\t" 453 454 "extp %[step2_10], $ac1, 31 \n\t" 455 "extp %[step2_13], $ac3, 31 \n\t" 456 "add %[step2_11], %[temp0], %[temp1] \n\t" 457 "add %[step2_12], %[temp2], %[temp3] \n\t" 458 459 : [load1] "=&r" (load1), [load2] "=&r" (load2), 460 [load3] "=&r" (load3), [load4] "=&r" (load4), 461 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 462 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 463 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), 464 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) 465 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 466 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), 467 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), 468 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) 469 ); 470 471 __asm__ __volatile__ ( 472 "mtlo %[const_2_power_13], $ac0 \n\t" 473 "mthi $zero, $ac0 \n\t" 474 "sub %[temp0], %[step2_14], %[step2_13] \n\t" 475 "sub %[temp0], %[temp0], %[step2_9] \n\t" 476 "add %[temp0], %[temp0], %[step2_10] \n\t" 477 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 478 479 "mtlo %[const_2_power_13], $ac1 \n\t" 480 "mthi $zero, $ac1 \n\t" 481 "sub %[temp1], %[step2_14], %[step2_13] \n\t" 482 "add %[temp1], %[temp1], %[step2_9] \n\t" 483 "sub %[temp1], %[temp1], %[step2_10] \n\t" 484 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 485 486 "mtlo %[const_2_power_13], $ac2 \n\t" 487 "mthi $zero, $ac2 \n\t" 488 "sub %[temp0], %[step2_15], %[step2_12] \n\t" 489 "sub %[temp0], %[temp0], %[step2_8] \n\t" 490 "add %[temp0], %[temp0], %[step2_11] \n\t" 491 "madd $ac2, %[temp0], %[cospi_16_64] \n\t" 492 493 "mtlo %[const_2_power_13], $ac3 \n\t" 494 "mthi $zero, $ac3 \n\t" 495 "sub %[temp1], %[step2_15], %[step2_12] \n\t" 496 "add %[temp1], %[temp1], %[step2_8] \n\t" 497 "sub %[temp1], %[temp1], %[step2_11] \n\t" 498 "madd $ac3, %[temp1], %[cospi_16_64] \n\t" 499 500 "add %[step3_8], %[step2_8], %[step2_11] \n\t" 501 "add %[step3_9], %[step2_9], %[step2_10] \n\t" 502 "add %[step3_14], %[step2_13], %[step2_14] \n\t" 503 "add %[step3_15], %[step2_12], %[step2_15] \n\t" 504 505 "extp %[step3_10], $ac0, 31 \n\t" 506 "extp %[step3_13], $ac1, 31 \n\t" 507 "extp %[step3_11], $ac2, 31 \n\t" 508 "extp %[step3_12], $ac3, 31 \n\t" 509 510 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 511 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), 512 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), 513 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), 514 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) 515 : [const_2_power_13] "r" (const_2_power_13), 516 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), 517 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), 518 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), 519 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), 520 [cospi_16_64] "r" (cospi_16_64) 521 ); 522 523 step2_18 = step1_17 - step1_18; 524 step2_29 = step1_30 - step1_29; 525 526 __asm__ __volatile__ ( 527 "mtlo %[const_2_power_13], $ac0 \n\t" 528 "mthi $zero, $ac0 \n\t" 529 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" 530 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" 531 "extp %[step3_18], $ac0, 31 \n\t" 532 533 : [step3_18] "=r" (step3_18) 534 : [const_2_power_13] "r" (const_2_power_13), 535 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), 536 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 537 ); 538 539 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; 540 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 541 542 step2_19 = step1_16 - step1_19; 543 step2_28 = step1_31 - step1_28; 544 545 __asm__ __volatile__ ( 546 "mtlo %[const_2_power_13], $ac0 \n\t" 547 "mthi $zero, $ac0 \n\t" 548 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" 549 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" 550 "extp %[step3_19], $ac0, 31 \n\t" 551 552 : [step3_19] "=r" (step3_19) 553 : [const_2_power_13] "r" (const_2_power_13), 554 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), 555 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 556 ); 557 558 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; 559 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 560 561 step3_16 = step1_16 + step1_19; 562 step3_17 = step1_17 + step1_18; 563 step3_30 = step1_29 + step1_30; 564 step3_31 = step1_28 + step1_31; 565 566 step2_20 = step1_23 - step1_20; 567 step2_27 = step1_24 - step1_27; 568 569 __asm__ __volatile__ ( 570 "mtlo %[const_2_power_13], $ac0 \n\t" 571 "mthi $zero, $ac0 \n\t" 572 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" 573 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" 574 "extp %[step3_20], $ac0, 31 \n\t" 575 576 : [step3_20] "=r" (step3_20) 577 : [const_2_power_13] "r" (const_2_power_13), 578 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), 579 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 580 ); 581 582 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; 583 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 584 585 step2_21 = step1_22 - step1_21; 586 step2_26 = step1_25 - step1_26; 587 588 __asm__ __volatile__ ( 589 "mtlo %[const_2_power_13], $ac1 \n\t" 590 "mthi $zero, $ac1 \n\t" 591 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" 592 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" 593 "extp %[step3_21], $ac1, 31 \n\t" 594 595 : [step3_21] "=r" (step3_21) 596 : [const_2_power_13] "r" (const_2_power_13), 597 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), 598 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 599 ); 600 601 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; 602 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 603 604 step3_22 = step1_21 + step1_22; 605 step3_23 = step1_20 + step1_23; 606 step3_24 = step1_24 + step1_27; 607 step3_25 = step1_25 + step1_26; 608 609 step2_16 = step3_16 + step3_23; 610 step2_17 = step3_17 + step3_22; 611 step2_18 = step3_18 + step3_21; 612 step2_19 = step3_19 + step3_20; 613 step2_20 = step3_19 - step3_20; 614 step2_21 = step3_18 - step3_21; 615 step2_22 = step3_17 - step3_22; 616 step2_23 = step3_16 - step3_23; 617 618 step2_24 = step3_31 - step3_24; 619 step2_25 = step3_30 - step3_25; 620 step2_26 = step3_29 - step3_26; 621 step2_27 = step3_28 - step3_27; 622 step2_28 = step3_28 + step3_27; 623 step2_29 = step3_29 + step3_26; 624 step2_30 = step3_30 + step3_25; 625 step2_31 = step3_31 + step3_24; 626 627 __asm__ __volatile__ ( 628 "lh %[load1], 0(%[input]) \n\t" 629 "lh %[load2], 32(%[input]) \n\t" 630 "lh %[load3], 16(%[input]) \n\t" 631 "lh %[load4], 48(%[input]) \n\t" 632 633 "mtlo %[const_2_power_13], $ac1 \n\t" 634 "mthi $zero, $ac1 \n\t" 635 "mtlo %[const_2_power_13], $ac2 \n\t" 636 "mthi $zero, $ac2 \n\t" 637 "add %[result1], %[load1], %[load2] \n\t" 638 "sub %[result2], %[load1], %[load2] \n\t" 639 "madd $ac1, %[result1], %[cospi_16_64] \n\t" 640 "madd $ac2, %[result2], %[cospi_16_64] \n\t" 641 "extp %[temp0], $ac1, 31 \n\t" 642 "extp %[temp1], $ac2, 31 \n\t" 643 644 "mtlo %[const_2_power_13], $ac3 \n\t" 645 "mthi $zero, $ac3 \n\t" 646 "madd $ac3, %[load3], %[cospi_24_64] \n\t" 647 "msub $ac3, %[load4], %[cospi_8_64] \n\t" 648 "extp %[temp2], $ac3, 31 \n\t" 649 650 "mtlo %[const_2_power_13], $ac1 \n\t" 651 "mthi $zero, $ac1 \n\t" 652 "madd $ac1, %[load3], %[cospi_8_64] \n\t" 653 "madd $ac1, %[load4], %[cospi_24_64] \n\t" 654 "extp %[temp3], $ac1, 31 \n\t" 655 656 "add %[step1_0], %[temp0], %[temp3] \n\t" 657 "add %[step1_1], %[temp1], %[temp2] \n\t" 658 "sub %[step1_2], %[temp1], %[temp2] \n\t" 659 "sub %[step1_3], %[temp0], %[temp3] \n\t" 660 661 : [load1] "=&r" (load1), [load2] "=&r" (load2), 662 [load3] "=&r" (load3), [load4] "=&r" (load4), 663 [result1] "=&r" (result1), [result2] "=&r" (result2), 664 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 665 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 666 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), 667 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) 668 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 669 [cospi_16_64] "r" (cospi_16_64), 670 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 671 672 ); 673 674 __asm__ __volatile__ ( 675 "lh %[load1], 8(%[input]) \n\t" 676 "lh %[load2], 56(%[input]) \n\t" 677 "lh %[load3], 40(%[input]) \n\t" 678 "lh %[load4], 24(%[input]) \n\t" 679 680 "mtlo %[const_2_power_13], $ac1 \n\t" 681 "mthi $zero, $ac1 \n\t" 682 "mtlo %[const_2_power_13], $ac3 \n\t" 683 "mthi $zero, $ac3 \n\t" 684 685 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 686 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 687 "extp %[temp0], $ac1, 31 \n\t" 688 689 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 690 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 691 "extp %[temp3], $ac3, 31 \n\t" 692 693 "mtlo %[const_2_power_13], $ac1 \n\t" 694 "mthi $zero, $ac1 \n\t" 695 "mtlo %[const_2_power_13], $ac2 \n\t" 696 "mthi $zero, $ac2 \n\t" 697 698 "madd $ac2, %[load3], %[cospi_12_64] \n\t" 699 "msub $ac2, %[load4], %[cospi_20_64] \n\t" 700 "extp %[temp1], $ac2, 31 \n\t" 701 702 "madd $ac1, %[load3], %[cospi_20_64] \n\t" 703 "madd $ac1, %[load4], %[cospi_12_64] \n\t" 704 "extp %[temp2], $ac1, 31 \n\t" 705 706 "mtlo %[const_2_power_13], $ac1 \n\t" 707 "mthi $zero, $ac1 \n\t" 708 "mtlo %[const_2_power_13], $ac3 \n\t" 709 "mthi $zero, $ac3 \n\t" 710 711 "sub %[load1], %[temp3], %[temp2] \n\t" 712 "sub %[load1], %[load1], %[temp0] \n\t" 713 "add %[load1], %[load1], %[temp1] \n\t" 714 715 "sub %[load2], %[temp0], %[temp1] \n\t" 716 "sub %[load2], %[load2], %[temp2] \n\t" 717 "add %[load2], %[load2], %[temp3] \n\t" 718 719 "madd $ac1, %[load1], %[cospi_16_64] \n\t" 720 "madd $ac3, %[load2], %[cospi_16_64] \n\t" 721 722 "extp %[step1_5], $ac1, 31 \n\t" 723 "extp %[step1_6], $ac3, 31 \n\t" 724 "add %[step1_4], %[temp0], %[temp1] \n\t" 725 "add %[step1_7], %[temp3], %[temp2] \n\t" 726 727 : [load1] "=&r" (load1), [load2] "=&r" (load2), 728 [load3] "=&r" (load3), [load4] "=&r" (load4), 729 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 730 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 731 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), 732 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) 733 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 734 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), 735 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), 736 [cospi_16_64] "r" (cospi_16_64) 737 ); 738 739 step2_0 = step1_0 + step1_7; 740 step2_1 = step1_1 + step1_6; 741 step2_2 = step1_2 + step1_5; 742 step2_3 = step1_3 + step1_4; 743 step2_4 = step1_3 - step1_4; 744 step2_5 = step1_2 - step1_5; 745 step2_6 = step1_1 - step1_6; 746 step2_7 = step1_0 - step1_7; 747 748 step1_0 = step2_0 + step3_15; 749 step1_1 = step2_1 + step3_14; 750 step1_2 = step2_2 + step3_13; 751 step1_3 = step2_3 + step3_12; 752 step1_4 = step2_4 + step3_11; 753 step1_5 = step2_5 + step3_10; 754 step1_6 = step2_6 + step3_9; 755 step1_7 = step2_7 + step3_8; 756 step1_8 = step2_7 - step3_8; 757 step1_9 = step2_6 - step3_9; 758 step1_10 = step2_5 - step3_10; 759 step1_11 = step2_4 - step3_11; 760 step1_12 = step2_3 - step3_12; 761 step1_13 = step2_2 - step3_13; 762 step1_14 = step2_1 - step3_14; 763 step1_15 = step2_0 - step3_15; 764 765 __asm__ __volatile__ ( 766 "sub %[temp0], %[step2_27], %[step2_20] \n\t" 767 "mtlo %[const_2_power_13], $ac0 \n\t" 768 "mthi $zero, $ac0 \n\t" 769 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 770 "extp %[step1_20], $ac0, 31 \n\t" 771 772 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) 773 : [const_2_power_13] "r" (const_2_power_13), 774 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), 775 [cospi_16_64] "r" (cospi_16_64) 776 ); 777 778 temp21 = (step2_20 + step2_27) * cospi_16_64; 779 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 780 781 __asm__ __volatile__ ( 782 "sub %[temp0], %[step2_26], %[step2_21] \n\t" 783 "mtlo %[const_2_power_13], $ac0 \n\t" 784 "mthi $zero, $ac0 \n\t" 785 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 786 "extp %[step1_21], $ac0, 31 \n\t" 787 788 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) 789 : [const_2_power_13] "r" (const_2_power_13), 790 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), 791 [cospi_16_64] "r" (cospi_16_64) 792 ); 793 794 temp21 = (step2_21 + step2_26) * cospi_16_64; 795 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 796 797 __asm__ __volatile__ ( 798 "sub %[temp0], %[step2_25], %[step2_22] \n\t" 799 "mtlo %[const_2_power_13], $ac0 \n\t" 800 "mthi $zero, $ac0 \n\t" 801 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 802 "extp %[step1_22], $ac0, 31 \n\t" 803 804 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) 805 : [const_2_power_13] "r" (const_2_power_13), 806 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), 807 [cospi_16_64] "r" (cospi_16_64) 808 ); 809 810 temp21 = (step2_22 + step2_25) * cospi_16_64; 811 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 812 813 __asm__ __volatile__ ( 814 "sub %[temp0], %[step2_24], %[step2_23] \n\t" 815 "mtlo %[const_2_power_13], $ac0 \n\t" 816 "mthi $zero, $ac0 \n\t" 817 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 818 "extp %[step1_23], $ac0, 31 \n\t" 819 820 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) 821 : [const_2_power_13] "r" (const_2_power_13), 822 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), 823 [cospi_16_64] "r" (cospi_16_64) 824 ); 825 826 temp21 = (step2_23 + step2_24) * cospi_16_64; 827 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 828 829 // final stage 830 output[0 * 32] = step1_0 + step2_31; 831 output[1 * 32] = step1_1 + step2_30; 832 output[2 * 32] = step1_2 + step2_29; 833 output[3 * 32] = step1_3 + step2_28; 834 output[4 * 32] = step1_4 + step1_27; 835 output[5 * 32] = step1_5 + step1_26; 836 output[6 * 32] = step1_6 + step1_25; 837 output[7 * 32] = step1_7 + step1_24; 838 output[8 * 32] = step1_8 + step1_23; 839 output[9 * 32] = step1_9 + step1_22; 840 output[10 * 32] = step1_10 + step1_21; 841 output[11 * 32] = step1_11 + step1_20; 842 output[12 * 32] = step1_12 + step2_19; 843 output[13 * 32] = step1_13 + step2_18; 844 output[14 * 32] = step1_14 + step2_17; 845 output[15 * 32] = step1_15 + step2_16; 846 output[16 * 32] = step1_15 - step2_16; 847 output[17 * 32] = step1_14 - step2_17; 848 output[18 * 32] = step1_13 - step2_18; 849 output[19 * 32] = step1_12 - step2_19; 850 output[20 * 32] = step1_11 - step1_20; 851 output[21 * 32] = step1_10 - step1_21; 852 output[22 * 32] = step1_9 - step1_22; 853 output[23 * 32] = step1_8 - step1_23; 854 output[24 * 32] = step1_7 - step1_24; 855 output[25 * 32] = step1_6 - step1_25; 856 output[26 * 32] = step1_5 - step1_26; 857 output[27 * 32] = step1_4 - step1_27; 858 output[28 * 32] = step1_3 - step2_28; 859 output[29 * 32] = step1_2 - step2_29; 860 output[30 * 32] = step1_1 - step2_30; 861 output[31 * 32] = step1_0 - step2_31; 862 863 input += 32; 864 output += 1; 865 } 866} 867 868void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, 869 int dest_stride) { 870 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); 871 int16_t *outptr = out; 872 uint32_t pos = 45; 873 874 /* bit positon for extract from acc */ 875 __asm__ __volatile__ ( 876 "wrdsp %[pos], 1 \n\t" 877 : 878 : [pos] "r" (pos) 879 ); 880 881 // Rows 882 idct32_rows_dspr2(input, outptr, 32); 883 884 // Columns 885 vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride); 886} 887 888void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, 889 int stride) { 890 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); 891 int16_t *outptr = out; 892 uint32_t i; 893 uint32_t pos = 45; 894 895 /* bit positon for extract from acc */ 896 __asm__ __volatile__ ( 897 "wrdsp %[pos], 1 \n\t" 898 : 899 : [pos] "r" (pos) 900 ); 901 902 // Rows 903 idct32_rows_dspr2(input, outptr, 8); 904 905 outptr += 8; 906 __asm__ __volatile__ ( 907 "sw $zero, 0(%[outptr]) \n\t" 908 "sw $zero, 4(%[outptr]) \n\t" 909 "sw $zero, 8(%[outptr]) \n\t" 910 "sw $zero, 12(%[outptr]) \n\t" 911 "sw $zero, 16(%[outptr]) \n\t" 912 "sw $zero, 20(%[outptr]) \n\t" 913 "sw $zero, 24(%[outptr]) \n\t" 914 "sw $zero, 28(%[outptr]) \n\t" 915 "sw $zero, 32(%[outptr]) \n\t" 916 "sw $zero, 36(%[outptr]) \n\t" 917 "sw $zero, 40(%[outptr]) \n\t" 918 "sw $zero, 44(%[outptr]) \n\t" 919 920 : 921 : [outptr] "r" (outptr) 922 ); 923 924 for (i = 0; i < 31; ++i) { 925 outptr += 32; 926 927 __asm__ __volatile__ ( 928 "sw $zero, 0(%[outptr]) \n\t" 929 "sw $zero, 4(%[outptr]) \n\t" 930 "sw $zero, 8(%[outptr]) \n\t" 931 "sw $zero, 12(%[outptr]) \n\t" 932 "sw $zero, 16(%[outptr]) \n\t" 933 "sw $zero, 20(%[outptr]) \n\t" 934 "sw $zero, 24(%[outptr]) \n\t" 935 "sw $zero, 28(%[outptr]) \n\t" 936 "sw $zero, 32(%[outptr]) \n\t" 937 "sw $zero, 36(%[outptr]) \n\t" 938 "sw $zero, 40(%[outptr]) \n\t" 939 "sw $zero, 44(%[outptr]) \n\t" 940 941 : 942 : [outptr] "r" (outptr) 943 ); 944 } 945 946 // Columns 947 vpx_idct32_cols_add_blk_dspr2(out, dest, stride); 948} 949 950void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, 951 int stride) { 952 int r, out; 953 int32_t a1, absa1; 954 int32_t vector_a1; 955 int32_t t1, t2, t3, t4; 956 int32_t vector_1, vector_2, vector_3, vector_4; 957 uint32_t pos = 45; 958 959 /* bit positon for extract from acc */ 960 __asm__ __volatile__ ( 961 "wrdsp %[pos], 1 \n\t" 962 963 : 964 : [pos] "r" (pos) 965 ); 966 967 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); 968 __asm__ __volatile__ ( 969 "addi %[out], %[out], 32 \n\t" 970 "sra %[a1], %[out], 6 \n\t" 971 972 : [out] "+r" (out), [a1] "=r" (a1) 973 : 974 ); 975 976 if (a1 < 0) { 977 /* use quad-byte 978 * input and output memory are four byte aligned */ 979 __asm__ __volatile__ ( 980 "abs %[absa1], %[a1] \n\t" 981 "replv.qb %[vector_a1], %[absa1] \n\t" 982 983 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) 984 : [a1] "r" (a1) 985 ); 986 987 for (r = 32; r--;) { 988 __asm__ __volatile__ ( 989 "lw %[t1], 0(%[dest]) \n\t" 990 "lw %[t2], 4(%[dest]) \n\t" 991 "lw %[t3], 8(%[dest]) \n\t" 992 "lw %[t4], 12(%[dest]) \n\t" 993 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 994 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 995 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 996 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 997 "sw %[vector_1], 0(%[dest]) \n\t" 998 "sw %[vector_2], 4(%[dest]) \n\t" 999 "sw %[vector_3], 8(%[dest]) \n\t" 1000 "sw %[vector_4], 12(%[dest]) \n\t" 1001 1002 "lw %[t1], 16(%[dest]) \n\t" 1003 "lw %[t2], 20(%[dest]) \n\t" 1004 "lw %[t3], 24(%[dest]) \n\t" 1005 "lw %[t4], 28(%[dest]) \n\t" 1006 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1007 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1008 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1009 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1010 "sw %[vector_1], 16(%[dest]) \n\t" 1011 "sw %[vector_2], 20(%[dest]) \n\t" 1012 "sw %[vector_3], 24(%[dest]) \n\t" 1013 "sw %[vector_4], 28(%[dest]) \n\t" 1014 1015 "add %[dest], %[dest], %[stride] \n\t" 1016 1017 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), 1018 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 1019 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), 1020 [dest] "+&r" (dest) 1021 : [stride] "r" (stride), [vector_a1] "r" (vector_a1) 1022 ); 1023 } 1024 } else { 1025 /* use quad-byte 1026 * input and output memory are four byte aligned */ 1027 __asm__ __volatile__ ( 1028 "replv.qb %[vector_a1], %[a1] \n\t" 1029 1030 : [vector_a1] "=r" (vector_a1) 1031 : [a1] "r" (a1) 1032 ); 1033 1034 for (r = 32; r--;) { 1035 __asm__ __volatile__ ( 1036 "lw %[t1], 0(%[dest]) \n\t" 1037 "lw %[t2], 4(%[dest]) \n\t" 1038 "lw %[t3], 8(%[dest]) \n\t" 1039 "lw %[t4], 12(%[dest]) \n\t" 1040 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1041 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1042 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1043 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1044 "sw %[vector_1], 0(%[dest]) \n\t" 1045 "sw %[vector_2], 4(%[dest]) \n\t" 1046 "sw %[vector_3], 8(%[dest]) \n\t" 1047 "sw %[vector_4], 12(%[dest]) \n\t" 1048 1049 "lw %[t1], 16(%[dest]) \n\t" 1050 "lw %[t2], 20(%[dest]) \n\t" 1051 "lw %[t3], 24(%[dest]) \n\t" 1052 "lw %[t4], 28(%[dest]) \n\t" 1053 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1054 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1055 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1056 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1057 "sw %[vector_1], 16(%[dest]) \n\t" 1058 "sw %[vector_2], 20(%[dest]) \n\t" 1059 "sw %[vector_3], 24(%[dest]) \n\t" 1060 "sw %[vector_4], 28(%[dest]) \n\t" 1061 1062 "add %[dest], %[dest], %[stride] \n\t" 1063 1064 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), 1065 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 1066 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), 1067 [dest] "+&r" (dest) 1068 : [stride] "r" (stride), [vector_a1] "r" (vector_a1) 1069 ); 1070 } 1071 } 1072} 1073#endif // #if HAVE_DSPR2 1074