1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <stdio.h> 13 14#include "./vpx_config.h" 15#include "./vp9_rtcd.h" 16#include "vp9/common/vp9_common.h" 17#include "vp9/common/vp9_blockd.h" 18#include "vp9/common/vp9_idct.h" 19#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" 20 21#if HAVE_DSPR2 22static void idct32_rows_dspr2(const int16_t *input, int16_t *output, 23 uint32_t no_rows) { 24 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; 25 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; 26 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; 27 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; 28 int16_t step1_28, step1_29, step1_30, step1_31; 29 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 30 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; 31 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; 32 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; 33 int16_t step2_28, step2_29, step2_30, step2_31; 34 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; 35 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; 36 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; 37 int16_t step3_29, step3_30, step3_31; 38 int temp0, temp1, temp2, temp3; 39 int load1, load2, load3, load4; 40 int result1, result2; 41 int temp21; 42 int i; 43 const int const_2_power_13 = 8192; 44 const int32_t *input_int; 45 46 for (i = no_rows; i--; ) { 47 input_int = (const int32_t *)input; 48 49 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | 50 input_int[4] | input_int[5] | input_int[6] | input_int[7] | 51 input_int[8] | input_int[9] | input_int[10] | input_int[11] | 52 input_int[12] | input_int[13] | input_int[14] | input_int[15])) { 53 input += 32; 54 55 __asm__ __volatile__ ( 56 "sh $zero, 0(%[output]) \n\t" 57 "sh $zero, 64(%[output]) \n\t" 58 "sh $zero, 128(%[output]) \n\t" 59 "sh $zero, 192(%[output]) \n\t" 60 "sh $zero, 256(%[output]) \n\t" 61 "sh $zero, 320(%[output]) \n\t" 62 "sh $zero, 384(%[output]) \n\t" 63 "sh $zero, 448(%[output]) \n\t" 64 "sh $zero, 512(%[output]) \n\t" 65 "sh $zero, 576(%[output]) \n\t" 66 "sh $zero, 640(%[output]) \n\t" 67 "sh $zero, 704(%[output]) \n\t" 68 "sh $zero, 768(%[output]) \n\t" 69 "sh $zero, 832(%[output]) \n\t" 70 "sh $zero, 896(%[output]) \n\t" 71 "sh $zero, 960(%[output]) \n\t" 72 "sh $zero, 1024(%[output]) \n\t" 73 "sh $zero, 1088(%[output]) \n\t" 74 "sh $zero, 1152(%[output]) \n\t" 75 "sh $zero, 1216(%[output]) \n\t" 76 "sh $zero, 1280(%[output]) \n\t" 77 "sh $zero, 1344(%[output]) \n\t" 78 "sh $zero, 1408(%[output]) \n\t" 79 "sh $zero, 1472(%[output]) \n\t" 80 "sh $zero, 1536(%[output]) \n\t" 81 "sh $zero, 1600(%[output]) \n\t" 82 "sh $zero, 1664(%[output]) \n\t" 83 "sh $zero, 1728(%[output]) \n\t" 84 "sh $zero, 1792(%[output]) \n\t" 85 "sh $zero, 1856(%[output]) \n\t" 86 "sh $zero, 1920(%[output]) \n\t" 87 "sh $zero, 1984(%[output]) \n\t" 88 89 : 90 : [output] "r" (output) 91 ); 92 93 output += 1; 94 95 continue; 96 } 97 98 /* prefetch row */ 99 vp9_prefetch_load((const uint8_t *)(input + 32)); 100 vp9_prefetch_load((const uint8_t *)(input + 48)); 101 102 __asm__ __volatile__ ( 103 "lh %[load1], 2(%[input]) \n\t" 104 "lh %[load2], 62(%[input]) \n\t" 105 "lh %[load3], 34(%[input]) \n\t" 106 "lh %[load4], 30(%[input]) \n\t" 107 108 "mtlo %[const_2_power_13], $ac1 \n\t" 109 "mthi $zero, $ac1 \n\t" 110 "mtlo %[const_2_power_13], $ac3 \n\t" 111 "mthi $zero, $ac3 \n\t" 112 113 "madd $ac1, %[load1], %[cospi_31_64] \n\t" 114 "msub $ac1, %[load2], %[cospi_1_64] \n\t" 115 "extp %[temp0], $ac1, 31 \n\t" 116 117 "madd $ac3, %[load1], %[cospi_1_64] \n\t" 118 "madd $ac3, %[load2], %[cospi_31_64] \n\t" 119 "extp %[temp3], $ac3, 31 \n\t" 120 121 "mtlo %[const_2_power_13], $ac1 \n\t" 122 "mthi $zero, $ac1 \n\t" 123 "mtlo %[const_2_power_13], $ac2 \n\t" 124 "mthi $zero, $ac2 \n\t" 125 126 "madd $ac2, %[load3], %[cospi_15_64] \n\t" 127 "msub $ac2, %[load4], %[cospi_17_64] \n\t" 128 "extp %[temp1], $ac2, 31 \n\t" 129 130 "madd $ac1, %[load3], %[cospi_17_64] \n\t" 131 "madd $ac1, %[load4], %[cospi_15_64] \n\t" 132 "extp %[temp2], $ac1, 31 \n\t" 133 134 "mtlo %[const_2_power_13], $ac1 \n\t" 135 "mthi $zero, $ac1 \n\t" 136 "mtlo %[const_2_power_13], $ac3 \n\t" 137 "mthi $zero, $ac3 \n\t" 138 139 "sub %[load1], %[temp3], %[temp2] \n\t" 140 "sub %[load2], %[temp0], %[temp1] \n\t" 141 142 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 143 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 144 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 145 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 146 147 "extp %[step1_17], $ac1, 31 \n\t" 148 "extp %[step1_30], $ac3, 31 \n\t" 149 "add %[step1_16], %[temp0], %[temp1] \n\t" 150 "add %[step1_31], %[temp2], %[temp3] \n\t" 151 152 : [load1] "=&r" (load1), [load2] "=&r" (load2), 153 [load3] "=&r" (load3), [load4] "=&r" (load4), 154 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 155 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 156 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), 157 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) 158 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 159 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), 160 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), 161 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) 162 ); 163 164 __asm__ __volatile__ ( 165 "lh %[load1], 18(%[input]) \n\t" 166 "lh %[load2], 46(%[input]) \n\t" 167 "lh %[load3], 50(%[input]) \n\t" 168 "lh %[load4], 14(%[input]) \n\t" 169 170 "mtlo %[const_2_power_13], $ac1 \n\t" 171 "mthi $zero, $ac1 \n\t" 172 "mtlo %[const_2_power_13], $ac3 \n\t" 173 "mthi $zero, $ac3 \n\t" 174 175 "madd $ac1, %[load1], %[cospi_23_64] \n\t" 176 "msub $ac1, %[load2], %[cospi_9_64] \n\t" 177 "extp %[temp0], $ac1, 31 \n\t" 178 179 "madd $ac3, %[load1], %[cospi_9_64] \n\t" 180 "madd $ac3, %[load2], %[cospi_23_64] \n\t" 181 "extp %[temp3], $ac3, 31 \n\t" 182 183 "mtlo %[const_2_power_13], $ac1 \n\t" 184 "mthi $zero, $ac1 \n\t" 185 "mtlo %[const_2_power_13], $ac2 \n\t" 186 "mthi $zero, $ac2 \n\t" 187 188 "madd $ac2, %[load3], %[cospi_7_64] \n\t" 189 "msub $ac2, %[load4], %[cospi_25_64] \n\t" 190 "extp %[temp1], $ac2, 31 \n\t" 191 192 "madd $ac1, %[load3], %[cospi_25_64] \n\t" 193 "madd $ac1, %[load4], %[cospi_7_64] \n\t" 194 "extp %[temp2], $ac1, 31 \n\t" 195 196 "mtlo %[const_2_power_13], $ac1 \n\t" 197 "mthi $zero, $ac1 \n\t" 198 "mtlo %[const_2_power_13], $ac3 \n\t" 199 "mthi $zero, $ac3 \n\t" 200 201 "sub %[load1], %[temp1], %[temp0] \n\t" 202 "sub %[load2], %[temp2], %[temp3] \n\t" 203 204 "msub $ac1, %[load1], %[cospi_28_64] \n\t" 205 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 206 "msub $ac3, %[load1], %[cospi_4_64] \n\t" 207 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 208 209 "extp %[step1_18], $ac1, 31 \n\t" 210 "extp %[step1_29], $ac3, 31 \n\t" 211 "add %[step1_19], %[temp0], %[temp1] \n\t" 212 "add %[step1_28], %[temp2], %[temp3] \n\t" 213 214 : [load1] "=&r" (load1), [load2] "=&r" (load2), 215 [load3] "=&r" (load3), [load4] "=&r" (load4), 216 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 217 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 218 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), 219 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) 220 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 221 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), 222 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), 223 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) 224 ); 225 226 __asm__ __volatile__ ( 227 "lh %[load1], 10(%[input]) \n\t" 228 "lh %[load2], 54(%[input]) \n\t" 229 "lh %[load3], 42(%[input]) \n\t" 230 "lh %[load4], 22(%[input]) \n\t" 231 232 "mtlo %[const_2_power_13], $ac1 \n\t" 233 "mthi $zero, $ac1 \n\t" 234 "mtlo %[const_2_power_13], $ac3 \n\t" 235 "mthi $zero, $ac3 \n\t" 236 237 "madd $ac1, %[load1], %[cospi_27_64] \n\t" 238 "msub $ac1, %[load2], %[cospi_5_64] \n\t" 239 "extp %[temp0], $ac1, 31 \n\t" 240 241 "madd $ac3, %[load1], %[cospi_5_64] \n\t" 242 "madd $ac3, %[load2], %[cospi_27_64] \n\t" 243 "extp %[temp3], $ac3, 31 \n\t" 244 245 "mtlo %[const_2_power_13], $ac1 \n\t" 246 "mthi $zero, $ac1 \n\t" 247 "mtlo %[const_2_power_13], $ac2 \n\t" 248 "mthi $zero, $ac2 \n\t" 249 250 "madd $ac2, %[load3], %[cospi_11_64] \n\t" 251 "msub $ac2, %[load4], %[cospi_21_64] \n\t" 252 "extp %[temp1], $ac2, 31 \n\t" 253 254 "madd $ac1, %[load3], %[cospi_21_64] \n\t" 255 "madd $ac1, %[load4], %[cospi_11_64] \n\t" 256 "extp %[temp2], $ac1, 31 \n\t" 257 258 "mtlo %[const_2_power_13], $ac1 \n\t" 259 "mthi $zero, $ac1 \n\t" 260 "mtlo %[const_2_power_13], $ac3 \n\t" 261 "mthi $zero, $ac3 \n\t" 262 263 "sub %[load1], %[temp0], %[temp1] \n\t" 264 "sub %[load2], %[temp3], %[temp2] \n\t" 265 266 "madd $ac1, %[load2], %[cospi_12_64] \n\t" 267 "msub $ac1, %[load1], %[cospi_20_64] \n\t" 268 "madd $ac3, %[load1], %[cospi_12_64] \n\t" 269 "madd $ac3, %[load2], %[cospi_20_64] \n\t" 270 271 "extp %[step1_21], $ac1, 31 \n\t" 272 "extp %[step1_26], $ac3, 31 \n\t" 273 "add %[step1_20], %[temp0], %[temp1] \n\t" 274 "add %[step1_27], %[temp2], %[temp3] \n\t" 275 276 : [load1] "=&r" (load1), [load2] "=&r" (load2), 277 [load3] "=&r" (load3), [load4] "=&r" (load4), 278 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 279 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 280 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), 281 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) 282 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 283 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), 284 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), 285 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) 286 ); 287 288 __asm__ __volatile__ ( 289 "lh %[load1], 26(%[input]) \n\t" 290 "lh %[load2], 38(%[input]) \n\t" 291 "lh %[load3], 58(%[input]) \n\t" 292 "lh %[load4], 6(%[input]) \n\t" 293 294 "mtlo %[const_2_power_13], $ac1 \n\t" 295 "mthi $zero, $ac1 \n\t" 296 "mtlo %[const_2_power_13], $ac3 \n\t" 297 "mthi $zero, $ac3 \n\t" 298 299 "madd $ac1, %[load1], %[cospi_19_64] \n\t" 300 "msub $ac1, %[load2], %[cospi_13_64] \n\t" 301 "extp %[temp0], $ac1, 31 \n\t" 302 303 "madd $ac3, %[load1], %[cospi_13_64] \n\t" 304 "madd $ac3, %[load2], %[cospi_19_64] \n\t" 305 "extp %[temp3], $ac3, 31 \n\t" 306 307 "mtlo %[const_2_power_13], $ac1 \n\t" 308 "mthi $zero, $ac1 \n\t" 309 "mtlo %[const_2_power_13], $ac2 \n\t" 310 "mthi $zero, $ac2 \n\t" 311 312 "madd $ac2, %[load3], %[cospi_3_64] \n\t" 313 "msub $ac2, %[load4], %[cospi_29_64] \n\t" 314 "extp %[temp1], $ac2, 31 \n\t" 315 316 "madd $ac1, %[load3], %[cospi_29_64] \n\t" 317 "madd $ac1, %[load4], %[cospi_3_64] \n\t" 318 "extp %[temp2], $ac1, 31 \n\t" 319 320 "mtlo %[const_2_power_13], $ac1 \n\t" 321 "mthi $zero, $ac1 \n\t" 322 "mtlo %[const_2_power_13], $ac3 \n\t" 323 "mthi $zero, $ac3 \n\t" 324 325 "sub %[load1], %[temp1], %[temp0] \n\t" 326 "sub %[load2], %[temp2], %[temp3] \n\t" 327 328 "msub $ac1, %[load1], %[cospi_12_64] \n\t" 329 "msub $ac1, %[load2], %[cospi_20_64] \n\t" 330 "msub $ac3, %[load1], %[cospi_20_64] \n\t" 331 "madd $ac3, %[load2], %[cospi_12_64] \n\t" 332 333 "extp %[step1_22], $ac1, 31 \n\t" 334 "extp %[step1_25], $ac3, 31 \n\t" 335 "add %[step1_23], %[temp0], %[temp1] \n\t" 336 "add %[step1_24], %[temp2], %[temp3] \n\t" 337 338 : [load1] "=&r" (load1), [load2] "=&r" (load2), 339 [load3] "=&r" (load3), [load4] "=&r" (load4), 340 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 341 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 342 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), 343 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) 344 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 345 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), 346 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), 347 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) 348 ); 349 350 __asm__ __volatile__ ( 351 "lh %[load1], 4(%[input]) \n\t" 352 "lh %[load2], 60(%[input]) \n\t" 353 "lh %[load3], 36(%[input]) \n\t" 354 "lh %[load4], 28(%[input]) \n\t" 355 356 "mtlo %[const_2_power_13], $ac1 \n\t" 357 "mthi $zero, $ac1 \n\t" 358 "mtlo %[const_2_power_13], $ac3 \n\t" 359 "mthi $zero, $ac3 \n\t" 360 361 "madd $ac1, %[load1], %[cospi_30_64] \n\t" 362 "msub $ac1, %[load2], %[cospi_2_64] \n\t" 363 "extp %[temp0], $ac1, 31 \n\t" 364 365 "madd $ac3, %[load1], %[cospi_2_64] \n\t" 366 "madd $ac3, %[load2], %[cospi_30_64] \n\t" 367 "extp %[temp3], $ac3, 31 \n\t" 368 369 "mtlo %[const_2_power_13], $ac1 \n\t" 370 "mthi $zero, $ac1 \n\t" 371 "mtlo %[const_2_power_13], $ac2 \n\t" 372 "mthi $zero, $ac2 \n\t" 373 374 "madd $ac2, %[load3], %[cospi_14_64] \n\t" 375 "msub $ac2, %[load4], %[cospi_18_64] \n\t" 376 "extp %[temp1], $ac2, 31 \n\t" 377 378 "madd $ac1, %[load3], %[cospi_18_64] \n\t" 379 "madd $ac1, %[load4], %[cospi_14_64] \n\t" 380 "extp %[temp2], $ac1, 31 \n\t" 381 382 "mtlo %[const_2_power_13], $ac1 \n\t" 383 "mthi $zero, $ac1 \n\t" 384 "mtlo %[const_2_power_13], $ac3 \n\t" 385 "mthi $zero, $ac3 \n\t" 386 387 "sub %[load1], %[temp0], %[temp1] \n\t" 388 "sub %[load2], %[temp3], %[temp2] \n\t" 389 390 "msub $ac1, %[load1], %[cospi_8_64] \n\t" 391 "madd $ac1, %[load2], %[cospi_24_64] \n\t" 392 "madd $ac3, %[load1], %[cospi_24_64] \n\t" 393 "madd $ac3, %[load2], %[cospi_8_64] \n\t" 394 395 "extp %[step2_9], $ac1, 31 \n\t" 396 "extp %[step2_14], $ac3, 31 \n\t" 397 "add %[step2_8], %[temp0], %[temp1] \n\t" 398 "add %[step2_15], %[temp2], %[temp3] \n\t" 399 400 : [load1] "=&r" (load1), [load2] "=&r" (load2), 401 [load3] "=&r" (load3), [load4] "=&r" (load4), 402 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 403 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 404 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), 405 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) 406 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 407 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), 408 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), 409 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) 410 ); 411 412 __asm__ __volatile__ ( 413 "lh %[load1], 20(%[input]) \n\t" 414 "lh %[load2], 44(%[input]) \n\t" 415 "lh %[load3], 52(%[input]) \n\t" 416 "lh %[load4], 12(%[input]) \n\t" 417 418 "mtlo %[const_2_power_13], $ac1 \n\t" 419 "mthi $zero, $ac1 \n\t" 420 "mtlo %[const_2_power_13], $ac3 \n\t" 421 "mthi $zero, $ac3 \n\t" 422 423 "madd $ac1, %[load1], %[cospi_22_64] \n\t" 424 "msub $ac1, %[load2], %[cospi_10_64] \n\t" 425 "extp %[temp0], $ac1, 31 \n\t" 426 427 "madd $ac3, %[load1], %[cospi_10_64] \n\t" 428 "madd $ac3, %[load2], %[cospi_22_64] \n\t" 429 "extp %[temp3], $ac3, 31 \n\t" 430 431 "mtlo %[const_2_power_13], $ac1 \n\t" 432 "mthi $zero, $ac1 \n\t" 433 "mtlo %[const_2_power_13], $ac2 \n\t" 434 "mthi $zero, $ac2 \n\t" 435 436 "madd $ac2, %[load3], %[cospi_6_64] \n\t" 437 "msub $ac2, %[load4], %[cospi_26_64] \n\t" 438 "extp %[temp1], $ac2, 31 \n\t" 439 440 "madd $ac1, %[load3], %[cospi_26_64] \n\t" 441 "madd $ac1, %[load4], %[cospi_6_64] \n\t" 442 "extp %[temp2], $ac1, 31 \n\t" 443 444 "mtlo %[const_2_power_13], $ac1 \n\t" 445 "mthi $zero, $ac1 \n\t" 446 "mtlo %[const_2_power_13], $ac3 \n\t" 447 "mthi $zero, $ac3 \n\t" 448 449 "sub %[load1], %[temp1], %[temp0] \n\t" 450 "sub %[load2], %[temp2], %[temp3] \n\t" 451 452 "msub $ac1, %[load1], %[cospi_24_64] \n\t" 453 "msub $ac1, %[load2], %[cospi_8_64] \n\t" 454 "madd $ac3, %[load2], %[cospi_24_64] \n\t" 455 "msub $ac3, %[load1], %[cospi_8_64] \n\t" 456 457 "extp %[step2_10], $ac1, 31 \n\t" 458 "extp %[step2_13], $ac3, 31 \n\t" 459 "add %[step2_11], %[temp0], %[temp1] \n\t" 460 "add %[step2_12], %[temp2], %[temp3] \n\t" 461 462 : [load1] "=&r" (load1), [load2] "=&r" (load2), 463 [load3] "=&r" (load3), [load4] "=&r" (load4), 464 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 465 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 466 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), 467 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) 468 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 469 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), 470 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), 471 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) 472 ); 473 474 __asm__ __volatile__ ( 475 "mtlo %[const_2_power_13], $ac0 \n\t" 476 "mthi $zero, $ac0 \n\t" 477 "sub %[temp0], %[step2_14], %[step2_13] \n\t" 478 "sub %[temp0], %[temp0], %[step2_9] \n\t" 479 "add %[temp0], %[temp0], %[step2_10] \n\t" 480 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 481 482 "mtlo %[const_2_power_13], $ac1 \n\t" 483 "mthi $zero, $ac1 \n\t" 484 "sub %[temp1], %[step2_14], %[step2_13] \n\t" 485 "add %[temp1], %[temp1], %[step2_9] \n\t" 486 "sub %[temp1], %[temp1], %[step2_10] \n\t" 487 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 488 489 "mtlo %[const_2_power_13], $ac2 \n\t" 490 "mthi $zero, $ac2 \n\t" 491 "sub %[temp0], %[step2_15], %[step2_12] \n\t" 492 "sub %[temp0], %[temp0], %[step2_8] \n\t" 493 "add %[temp0], %[temp0], %[step2_11] \n\t" 494 "madd $ac2, %[temp0], %[cospi_16_64] \n\t" 495 496 "mtlo %[const_2_power_13], $ac3 \n\t" 497 "mthi $zero, $ac3 \n\t" 498 "sub %[temp1], %[step2_15], %[step2_12] \n\t" 499 "add %[temp1], %[temp1], %[step2_8] \n\t" 500 "sub %[temp1], %[temp1], %[step2_11] \n\t" 501 "madd $ac3, %[temp1], %[cospi_16_64] \n\t" 502 503 "add %[step3_8], %[step2_8], %[step2_11] \n\t" 504 "add %[step3_9], %[step2_9], %[step2_10] \n\t" 505 "add %[step3_14], %[step2_13], %[step2_14] \n\t" 506 "add %[step3_15], %[step2_12], %[step2_15] \n\t" 507 508 "extp %[step3_10], $ac0, 31 \n\t" 509 "extp %[step3_13], $ac1, 31 \n\t" 510 "extp %[step3_11], $ac2, 31 \n\t" 511 "extp %[step3_12], $ac3, 31 \n\t" 512 513 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 514 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), 515 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), 516 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), 517 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) 518 : [const_2_power_13] "r" (const_2_power_13), 519 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), 520 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), 521 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), 522 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), 523 [cospi_16_64] "r" (cospi_16_64) 524 ); 525 526 step2_18 = step1_17 - step1_18; 527 step2_29 = step1_30 - step1_29; 528 529 __asm__ __volatile__ ( 530 "mtlo %[const_2_power_13], $ac0 \n\t" 531 "mthi $zero, $ac0 \n\t" 532 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" 533 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" 534 "extp %[step3_18], $ac0, 31 \n\t" 535 536 : [step3_18] "=r" (step3_18) 537 : [const_2_power_13] "r" (const_2_power_13), 538 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), 539 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 540 ); 541 542 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; 543 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 544 545 step2_19 = step1_16 - step1_19; 546 step2_28 = step1_31 - step1_28; 547 548 __asm__ __volatile__ ( 549 "mtlo %[const_2_power_13], $ac0 \n\t" 550 "mthi $zero, $ac0 \n\t" 551 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" 552 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" 553 "extp %[step3_19], $ac0, 31 \n\t" 554 555 : [step3_19] "=r" (step3_19) 556 : [const_2_power_13] "r" (const_2_power_13), 557 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), 558 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 559 ); 560 561 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; 562 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 563 564 step3_16 = step1_16 + step1_19; 565 step3_17 = step1_17 + step1_18; 566 step3_30 = step1_29 + step1_30; 567 step3_31 = step1_28 + step1_31; 568 569 step2_20 = step1_23 - step1_20; 570 step2_27 = step1_24 - step1_27; 571 572 __asm__ __volatile__ ( 573 "mtlo %[const_2_power_13], $ac0 \n\t" 574 "mthi $zero, $ac0 \n\t" 575 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" 576 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" 577 "extp %[step3_20], $ac0, 31 \n\t" 578 579 : [step3_20] "=r" (step3_20) 580 : [const_2_power_13] "r" (const_2_power_13), 581 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), 582 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 583 ); 584 585 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; 586 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 587 588 step2_21 = step1_22 - step1_21; 589 step2_26 = step1_25 - step1_26; 590 591 __asm__ __volatile__ ( 592 "mtlo %[const_2_power_13], $ac1 \n\t" 593 "mthi $zero, $ac1 \n\t" 594 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" 595 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" 596 "extp %[step3_21], $ac1, 31 \n\t" 597 598 : [step3_21] "=r" (step3_21) 599 : [const_2_power_13] "r" (const_2_power_13), 600 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), 601 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 602 ); 603 604 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; 605 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 606 607 step3_22 = step1_21 + step1_22; 608 step3_23 = step1_20 + step1_23; 609 step3_24 = step1_24 + step1_27; 610 step3_25 = step1_25 + step1_26; 611 612 step2_16 = step3_16 + step3_23; 613 step2_17 = step3_17 + step3_22; 614 step2_18 = step3_18 + step3_21; 615 step2_19 = step3_19 + step3_20; 616 step2_20 = step3_19 - step3_20; 617 step2_21 = step3_18 - step3_21; 618 step2_22 = step3_17 - step3_22; 619 step2_23 = step3_16 - step3_23; 620 621 step2_24 = step3_31 - step3_24; 622 step2_25 = step3_30 - step3_25; 623 step2_26 = step3_29 - step3_26; 624 step2_27 = step3_28 - step3_27; 625 step2_28 = step3_28 + step3_27; 626 step2_29 = step3_29 + step3_26; 627 step2_30 = step3_30 + step3_25; 628 step2_31 = step3_31 + step3_24; 629 630 __asm__ __volatile__ ( 631 "lh %[load1], 0(%[input]) \n\t" 632 "lh %[load2], 32(%[input]) \n\t" 633 "lh %[load3], 16(%[input]) \n\t" 634 "lh %[load4], 48(%[input]) \n\t" 635 636 "mtlo %[const_2_power_13], $ac1 \n\t" 637 "mthi $zero, $ac1 \n\t" 638 "mtlo %[const_2_power_13], $ac2 \n\t" 639 "mthi $zero, $ac2 \n\t" 640 "add %[result1], %[load1], %[load2] \n\t" 641 "sub %[result2], %[load1], %[load2] \n\t" 642 "madd $ac1, %[result1], %[cospi_16_64] \n\t" 643 "madd $ac2, %[result2], %[cospi_16_64] \n\t" 644 "extp %[temp0], $ac1, 31 \n\t" 645 "extp %[temp1], $ac2, 31 \n\t" 646 647 "mtlo %[const_2_power_13], $ac3 \n\t" 648 "mthi $zero, $ac3 \n\t" 649 "madd $ac3, %[load3], %[cospi_24_64] \n\t" 650 "msub $ac3, %[load4], %[cospi_8_64] \n\t" 651 "extp %[temp2], $ac3, 31 \n\t" 652 653 "mtlo %[const_2_power_13], $ac1 \n\t" 654 "mthi $zero, $ac1 \n\t" 655 "madd $ac1, %[load3], %[cospi_8_64] \n\t" 656 "madd $ac1, %[load4], %[cospi_24_64] \n\t" 657 "extp %[temp3], $ac1, 31 \n\t" 658 659 "add %[step1_0], %[temp0], %[temp3] \n\t" 660 "add %[step1_1], %[temp1], %[temp2] \n\t" 661 "sub %[step1_2], %[temp1], %[temp2] \n\t" 662 "sub %[step1_3], %[temp0], %[temp3] \n\t" 663 664 : [load1] "=&r" (load1), [load2] "=&r" (load2), 665 [load3] "=&r" (load3), [load4] "=&r" (load4), 666 [result1] "=&r" (result1), [result2] "=&r" (result2), 667 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 668 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 669 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), 670 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) 671 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 672 [cospi_16_64] "r" (cospi_16_64), 673 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) 674 675 ); 676 677 __asm__ __volatile__ ( 678 "lh %[load1], 8(%[input]) \n\t" 679 "lh %[load2], 56(%[input]) \n\t" 680 "lh %[load3], 40(%[input]) \n\t" 681 "lh %[load4], 24(%[input]) \n\t" 682 683 "mtlo %[const_2_power_13], $ac1 \n\t" 684 "mthi $zero, $ac1 \n\t" 685 "mtlo %[const_2_power_13], $ac3 \n\t" 686 "mthi $zero, $ac3 \n\t" 687 688 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 689 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 690 "extp %[temp0], $ac1, 31 \n\t" 691 692 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 693 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 694 "extp %[temp3], $ac3, 31 \n\t" 695 696 "mtlo %[const_2_power_13], $ac1 \n\t" 697 "mthi $zero, $ac1 \n\t" 698 "mtlo %[const_2_power_13], $ac2 \n\t" 699 "mthi $zero, $ac2 \n\t" 700 701 "madd $ac2, %[load3], %[cospi_12_64] \n\t" 702 "msub $ac2, %[load4], %[cospi_20_64] \n\t" 703 "extp %[temp1], $ac2, 31 \n\t" 704 705 "madd $ac1, %[load3], %[cospi_20_64] \n\t" 706 "madd $ac1, %[load4], %[cospi_12_64] \n\t" 707 "extp %[temp2], $ac1, 31 \n\t" 708 709 "mtlo %[const_2_power_13], $ac1 \n\t" 710 "mthi $zero, $ac1 \n\t" 711 "mtlo %[const_2_power_13], $ac3 \n\t" 712 "mthi $zero, $ac3 \n\t" 713 714 "sub %[load1], %[temp3], %[temp2] \n\t" 715 "sub %[load1], %[load1], %[temp0] \n\t" 716 "add %[load1], %[load1], %[temp1] \n\t" 717 718 "sub %[load2], %[temp0], %[temp1] \n\t" 719 "sub %[load2], %[load2], %[temp2] \n\t" 720 "add %[load2], %[load2], %[temp3] \n\t" 721 722 "madd $ac1, %[load1], %[cospi_16_64] \n\t" 723 "madd $ac3, %[load2], %[cospi_16_64] \n\t" 724 725 "extp %[step1_5], $ac1, 31 \n\t" 726 "extp %[step1_6], $ac3, 31 \n\t" 727 "add %[step1_4], %[temp0], %[temp1] \n\t" 728 "add %[step1_7], %[temp3], %[temp2] \n\t" 729 730 : [load1] "=&r" (load1), [load2] "=&r" (load2), 731 [load3] "=&r" (load3), [load4] "=&r" (load4), 732 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), 733 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), 734 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), 735 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) 736 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), 737 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), 738 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), 739 [cospi_16_64] "r" (cospi_16_64) 740 ); 741 742 step2_0 = step1_0 + step1_7; 743 step2_1 = step1_1 + step1_6; 744 step2_2 = step1_2 + step1_5; 745 step2_3 = step1_3 + step1_4; 746 step2_4 = step1_3 - step1_4; 747 step2_5 = step1_2 - step1_5; 748 step2_6 = step1_1 - step1_6; 749 step2_7 = step1_0 - step1_7; 750 751 step1_0 = step2_0 + step3_15; 752 step1_1 = step2_1 + step3_14; 753 step1_2 = step2_2 + step3_13; 754 step1_3 = step2_3 + step3_12; 755 step1_4 = step2_4 + step3_11; 756 step1_5 = step2_5 + step3_10; 757 step1_6 = step2_6 + step3_9; 758 step1_7 = step2_7 + step3_8; 759 step1_8 = step2_7 - step3_8; 760 step1_9 = step2_6 - step3_9; 761 step1_10 = step2_5 - step3_10; 762 step1_11 = step2_4 - step3_11; 763 step1_12 = step2_3 - step3_12; 764 step1_13 = step2_2 - step3_13; 765 step1_14 = step2_1 - step3_14; 766 step1_15 = step2_0 - step3_15; 767 768 __asm__ __volatile__ ( 769 "sub %[temp0], %[step2_27], %[step2_20] \n\t" 770 "mtlo %[const_2_power_13], $ac0 \n\t" 771 "mthi $zero, $ac0 \n\t" 772 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 773 "extp %[step1_20], $ac0, 31 \n\t" 774 775 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) 776 : [const_2_power_13] "r" (const_2_power_13), 777 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), 778 [cospi_16_64] "r" (cospi_16_64) 779 ); 780 781 temp21 = (step2_20 + step2_27) * cospi_16_64; 782 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 783 784 __asm__ __volatile__ ( 785 "sub %[temp0], %[step2_26], %[step2_21] \n\t" 786 "mtlo %[const_2_power_13], $ac0 \n\t" 787 "mthi $zero, $ac0 \n\t" 788 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 789 "extp %[step1_21], $ac0, 31 \n\t" 790 791 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) 792 : [const_2_power_13] "r" (const_2_power_13), 793 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), 794 [cospi_16_64] "r" (cospi_16_64) 795 ); 796 797 temp21 = (step2_21 + step2_26) * cospi_16_64; 798 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 799 800 __asm__ __volatile__ ( 801 "sub %[temp0], %[step2_25], %[step2_22] \n\t" 802 "mtlo %[const_2_power_13], $ac0 \n\t" 803 "mthi $zero, $ac0 \n\t" 804 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 805 "extp %[step1_22], $ac0, 31 \n\t" 806 807 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) 808 : [const_2_power_13] "r" (const_2_power_13), 809 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), 810 [cospi_16_64] "r" (cospi_16_64) 811 ); 812 813 temp21 = (step2_22 + step2_25) * cospi_16_64; 814 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 815 816 __asm__ __volatile__ ( 817 "sub %[temp0], %[step2_24], %[step2_23] \n\t" 818 "mtlo %[const_2_power_13], $ac0 \n\t" 819 "mthi $zero, $ac0 \n\t" 820 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 821 "extp %[step1_23], $ac0, 31 \n\t" 822 823 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) 824 : [const_2_power_13] "r" (const_2_power_13), 825 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), 826 [cospi_16_64] "r" (cospi_16_64) 827 ); 828 829 temp21 = (step2_23 + step2_24) * cospi_16_64; 830 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; 831 832 // final stage 833 output[0 * 32] = step1_0 + step2_31; 834 output[1 * 32] = step1_1 + step2_30; 835 output[2 * 32] = step1_2 + step2_29; 836 output[3 * 32] = step1_3 + step2_28; 837 output[4 * 32] = step1_4 + step1_27; 838 output[5 * 32] = step1_5 + step1_26; 839 output[6 * 32] = step1_6 + step1_25; 840 output[7 * 32] = step1_7 + step1_24; 841 output[8 * 32] = step1_8 + step1_23; 842 output[9 * 32] = step1_9 + step1_22; 843 output[10 * 32] = step1_10 + step1_21; 844 output[11 * 32] = step1_11 + step1_20; 845 output[12 * 32] = step1_12 + step2_19; 846 output[13 * 32] = step1_13 + step2_18; 847 output[14 * 32] = step1_14 + step2_17; 848 output[15 * 32] = step1_15 + step2_16; 849 output[16 * 32] = step1_15 - step2_16; 850 output[17 * 32] = step1_14 - step2_17; 851 output[18 * 32] = step1_13 - step2_18; 852 output[19 * 32] = step1_12 - step2_19; 853 output[20 * 32] = step1_11 - step1_20; 854 output[21 * 32] = step1_10 - step1_21; 855 output[22 * 32] = step1_9 - step1_22; 856 output[23 * 32] = step1_8 - step1_23; 857 output[24 * 32] = step1_7 - step1_24; 858 output[25 * 32] = step1_6 - step1_25; 859 output[26 * 32] = step1_5 - step1_26; 860 output[27 * 32] = step1_4 - step1_27; 861 output[28 * 32] = step1_3 - step2_28; 862 output[29 * 32] = step1_2 - step2_29; 863 output[30 * 32] = step1_1 - step2_30; 864 output[31 * 32] = step1_0 - step2_31; 865 866 input += 32; 867 output += 1; 868 } 869} 870 871void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, 872 int dest_stride) { 873 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); 874 int16_t *outptr = out; 875 uint32_t pos = 45; 876 877 /* bit positon for extract from acc */ 878 __asm__ __volatile__ ( 879 "wrdsp %[pos], 1 \n\t" 880 : 881 : [pos] "r" (pos) 882 ); 883 884 // Rows 885 idct32_rows_dspr2(input, outptr, 32); 886 887 // Columns 888 vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); 889} 890 891void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, 892 int stride) { 893 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); 894 int16_t *outptr = out; 895 uint32_t i; 896 uint32_t pos = 45; 897 898 /* bit positon for extract from acc */ 899 __asm__ __volatile__ ( 900 "wrdsp %[pos], 1 \n\t" 901 : 902 : [pos] "r" (pos) 903 ); 904 905 // Rows 906 idct32_rows_dspr2(input, outptr, 8); 907 908 outptr += 8; 909 __asm__ __volatile__ ( 910 "sw $zero, 0(%[outptr]) \n\t" 911 "sw $zero, 4(%[outptr]) \n\t" 912 "sw $zero, 8(%[outptr]) \n\t" 913 "sw $zero, 12(%[outptr]) \n\t" 914 "sw $zero, 16(%[outptr]) \n\t" 915 "sw $zero, 20(%[outptr]) \n\t" 916 "sw $zero, 24(%[outptr]) \n\t" 917 "sw $zero, 28(%[outptr]) \n\t" 918 "sw $zero, 32(%[outptr]) \n\t" 919 "sw $zero, 36(%[outptr]) \n\t" 920 "sw $zero, 40(%[outptr]) \n\t" 921 "sw $zero, 44(%[outptr]) \n\t" 922 923 : 924 : [outptr] "r" (outptr) 925 ); 926 927 for (i = 0; i < 31; ++i) { 928 outptr += 32; 929 930 __asm__ __volatile__ ( 931 "sw $zero, 0(%[outptr]) \n\t" 932 "sw $zero, 4(%[outptr]) \n\t" 933 "sw $zero, 8(%[outptr]) \n\t" 934 "sw $zero, 12(%[outptr]) \n\t" 935 "sw $zero, 16(%[outptr]) \n\t" 936 "sw $zero, 20(%[outptr]) \n\t" 937 "sw $zero, 24(%[outptr]) \n\t" 938 "sw $zero, 28(%[outptr]) \n\t" 939 "sw $zero, 32(%[outptr]) \n\t" 940 "sw $zero, 36(%[outptr]) \n\t" 941 "sw $zero, 40(%[outptr]) \n\t" 942 "sw $zero, 44(%[outptr]) \n\t" 943 944 : 945 : [outptr] "r" (outptr) 946 ); 947 } 948 949 // Columns 950 vp9_idct32_cols_add_blk_dspr2(out, dest, stride); 951} 952 953void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, 954 int stride) { 955 int r, out; 956 int32_t a1, absa1; 957 int32_t vector_a1; 958 int32_t t1, t2, t3, t4; 959 int32_t vector_1, vector_2, vector_3, vector_4; 960 uint32_t pos = 45; 961 962 /* bit positon for extract from acc */ 963 __asm__ __volatile__ ( 964 "wrdsp %[pos], 1 \n\t" 965 966 : 967 : [pos] "r" (pos) 968 ); 969 970 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); 971 __asm__ __volatile__ ( 972 "addi %[out], %[out], 32 \n\t" 973 "sra %[a1], %[out], 6 \n\t" 974 975 : [out] "+r" (out), [a1] "=r" (a1) 976 : 977 ); 978 979 if (a1 < 0) { 980 /* use quad-byte 981 * input and output memory are four byte aligned */ 982 __asm__ __volatile__ ( 983 "abs %[absa1], %[a1] \n\t" 984 "replv.qb %[vector_a1], %[absa1] \n\t" 985 986 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) 987 : [a1] "r" (a1) 988 ); 989 990 for (r = 32; r--;) { 991 __asm__ __volatile__ ( 992 "lw %[t1], 0(%[dest]) \n\t" 993 "lw %[t2], 4(%[dest]) \n\t" 994 "lw %[t3], 8(%[dest]) \n\t" 995 "lw %[t4], 12(%[dest]) \n\t" 996 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 997 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 998 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 999 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1000 "sw %[vector_1], 0(%[dest]) \n\t" 1001 "sw %[vector_2], 4(%[dest]) \n\t" 1002 "sw %[vector_3], 8(%[dest]) \n\t" 1003 "sw %[vector_4], 12(%[dest]) \n\t" 1004 1005 "lw %[t1], 16(%[dest]) \n\t" 1006 "lw %[t2], 20(%[dest]) \n\t" 1007 "lw %[t3], 24(%[dest]) \n\t" 1008 "lw %[t4], 28(%[dest]) \n\t" 1009 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1010 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1011 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1012 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1013 "sw %[vector_1], 16(%[dest]) \n\t" 1014 "sw %[vector_2], 20(%[dest]) \n\t" 1015 "sw %[vector_3], 24(%[dest]) \n\t" 1016 "sw %[vector_4], 28(%[dest]) \n\t" 1017 1018 "add %[dest], %[dest], %[stride] \n\t" 1019 1020 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), 1021 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 1022 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), 1023 [dest] "+&r" (dest) 1024 : [stride] "r" (stride), [vector_a1] "r" (vector_a1) 1025 ); 1026 } 1027 } else { 1028 /* use quad-byte 1029 * input and output memory are four byte aligned */ 1030 __asm__ __volatile__ ( 1031 "replv.qb %[vector_a1], %[a1] \n\t" 1032 1033 : [vector_a1] "=r" (vector_a1) 1034 : [a1] "r" (a1) 1035 ); 1036 1037 for (r = 32; r--;) { 1038 __asm__ __volatile__ ( 1039 "lw %[t1], 0(%[dest]) \n\t" 1040 "lw %[t2], 4(%[dest]) \n\t" 1041 "lw %[t3], 8(%[dest]) \n\t" 1042 "lw %[t4], 12(%[dest]) \n\t" 1043 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1044 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1045 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1046 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1047 "sw %[vector_1], 0(%[dest]) \n\t" 1048 "sw %[vector_2], 4(%[dest]) \n\t" 1049 "sw %[vector_3], 8(%[dest]) \n\t" 1050 "sw %[vector_4], 12(%[dest]) \n\t" 1051 1052 "lw %[t1], 16(%[dest]) \n\t" 1053 "lw %[t2], 20(%[dest]) \n\t" 1054 "lw %[t3], 24(%[dest]) \n\t" 1055 "lw %[t4], 28(%[dest]) \n\t" 1056 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1057 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1058 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1059 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1060 "sw %[vector_1], 16(%[dest]) \n\t" 1061 "sw %[vector_2], 20(%[dest]) \n\t" 1062 "sw %[vector_3], 24(%[dest]) \n\t" 1063 "sw %[vector_4], 28(%[dest]) \n\t" 1064 1065 "add %[dest], %[dest], %[stride] \n\t" 1066 1067 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), 1068 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 1069 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), 1070 [dest] "+&r" (dest) 1071 : [stride] "r" (stride), [vector_a1] "r" (vector_a1) 1072 ); 1073 } 1074 } 1075} 1076#endif // #if HAVE_DSPR2 1077