1/* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vpx_config.h" 12#include "./vpx_dsp_rtcd.h" 13#include "vpx_dsp/mips/inv_txfm_dspr2.h" 14#include "vpx_dsp/txfm_common.h" 15 16#if HAVE_DSPR2 17void idct16_rows_dspr2(const int16_t *input, int16_t *output, 18 uint32_t no_rows) { 19 int i; 20 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 21 int step1_10, step1_11, step1_12, step1_13; 22 int step2_0, step2_1, step2_2, step2_3; 23 int step2_8, step2_9, step2_10, step2_11; 24 int step2_12, step2_13, step2_14, step2_15; 25 int load1, load2, load3, load4, load5, load6, load7, load8; 26 int result1, result2, result3, result4; 27 const int const_2_power_13 = 8192; 28 29 for (i = no_rows; i--;) { 30 /* prefetch row */ 31 prefetch_load((const uint8_t *)(input + 16)); 32 33 __asm__ __volatile__( 34 "lh %[load1], 0(%[input]) \n\t" 35 "lh %[load2], 16(%[input]) \n\t" 36 "lh %[load3], 8(%[input]) \n\t" 37 "lh %[load4], 24(%[input]) \n\t" 38 39 "mtlo %[const_2_power_13], $ac1 \n\t" 40 "mthi $zero, $ac1 \n\t" 41 "mtlo %[const_2_power_13], $ac2 \n\t" 42 "mthi $zero, $ac2 \n\t" 43 "add %[result1], %[load1], %[load2] \n\t" 44 "sub %[result2], %[load1], %[load2] \n\t" 45 "madd $ac1, %[result1], %[cospi_16_64] \n\t" 46 "madd $ac2, %[result2], %[cospi_16_64] \n\t" 47 "extp %[step2_0], $ac1, 31 \n\t" 48 "extp %[step2_1], $ac2, 31 \n\t" 49 50 "mtlo %[const_2_power_13], $ac3 \n\t" 51 "mthi $zero, $ac3 \n\t" 52 "madd $ac3, %[load3], %[cospi_24_64] \n\t" 53 "msub $ac3, %[load4], %[cospi_8_64] \n\t" 54 "extp %[step2_2], $ac3, 31 \n\t" 55 56 "mtlo %[const_2_power_13], $ac1 \n\t" 57 "mthi 
$zero, $ac1 \n\t" 58 "madd $ac1, %[load3], %[cospi_8_64] \n\t" 59 "madd $ac1, %[load4], %[cospi_24_64] \n\t" 60 "extp %[step2_3], $ac1, 31 \n\t" 61 62 "add %[step1_0], %[step2_0], %[step2_3] \n\t" 63 "add %[step1_1], %[step2_1], %[step2_2] \n\t" 64 "sub %[step1_2], %[step2_1], %[step2_2] \n\t" 65 "sub %[step1_3], %[step2_0], %[step2_3] \n\t" 66 67 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 68 [load4] "=&r"(load4), [result1] "=&r"(result1), 69 [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), 70 [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), 71 [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), 72 [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), 73 [step1_3] "=r"(step1_3) 74 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 75 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), 76 [cospi_16_64] "r"(cospi_16_64)); 77 78 __asm__ __volatile__( 79 "lh %[load5], 2(%[input]) \n\t" 80 "lh %[load6], 30(%[input]) \n\t" 81 "lh %[load7], 18(%[input]) \n\t" 82 "lh %[load8], 14(%[input]) \n\t" 83 84 "mtlo %[const_2_power_13], $ac1 \n\t" 85 "mthi $zero, $ac1 \n\t" 86 "mtlo %[const_2_power_13], $ac3 \n\t" 87 "mthi $zero, $ac3 \n\t" 88 89 "madd $ac1, %[load5], %[cospi_30_64] \n\t" 90 "msub $ac1, %[load6], %[cospi_2_64] \n\t" 91 "extp %[result1], $ac1, 31 \n\t" 92 93 "madd $ac3, %[load7], %[cospi_14_64] \n\t" 94 "msub $ac3, %[load8], %[cospi_18_64] \n\t" 95 "extp %[result2], $ac3, 31 \n\t" 96 97 "mtlo %[const_2_power_13], $ac1 \n\t" 98 "mthi $zero, $ac1 \n\t" 99 "mtlo %[const_2_power_13], $ac2 \n\t" 100 "mthi $zero, $ac2 \n\t" 101 102 "madd $ac1, %[load7], %[cospi_18_64] \n\t" 103 "madd $ac1, %[load8], %[cospi_14_64] \n\t" 104 "extp %[result3], $ac1, 31 \n\t" 105 106 "madd $ac2, %[load5], %[cospi_2_64] \n\t" 107 "madd $ac2, %[load6], %[cospi_30_64] \n\t" 108 "extp %[result4], $ac2, 31 \n\t" 109 110 "sub %[load5], %[result1], %[result2] \n\t" 111 "sub %[load6], %[result4], %[result3] \n\t" 112 113 "mtlo %[const_2_power_13], 
$ac1 \n\t" 114 "mthi $zero, $ac1 \n\t" 115 "mtlo %[const_2_power_13], $ac3 \n\t" 116 "mthi $zero, $ac3 \n\t" 117 118 "madd $ac1, %[load6], %[cospi_24_64] \n\t" 119 "msub $ac1, %[load5], %[cospi_8_64] \n\t" 120 "madd $ac3, %[load5], %[cospi_24_64] \n\t" 121 "madd $ac3, %[load6], %[cospi_8_64] \n\t" 122 123 "extp %[step2_9], $ac1, 31 \n\t" 124 "extp %[step2_14], $ac3, 31 \n\t" 125 "add %[step2_8], %[result1], %[result2] \n\t" 126 "add %[step2_15], %[result4], %[result3] \n\t" 127 128 : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), 129 [load8] "=&r"(load8), [result1] "=&r"(result1), 130 [result2] "=&r"(result2), [result3] "=&r"(result3), 131 [result4] "=&r"(result4), [step2_8] "=r"(step2_8), 132 [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), 133 [step2_14] "=r"(step2_14) 134 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 135 [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), 136 [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), 137 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); 138 139 __asm__ __volatile__( 140 "lh %[load1], 10(%[input]) \n\t" 141 "lh %[load2], 22(%[input]) \n\t" 142 "lh %[load3], 26(%[input]) \n\t" 143 "lh %[load4], 6(%[input]) \n\t" 144 145 "mtlo %[const_2_power_13], $ac1 \n\t" 146 "mthi $zero, $ac1 \n\t" 147 "mtlo %[const_2_power_13], $ac3 \n\t" 148 "mthi $zero, $ac3 \n\t" 149 150 "madd $ac1, %[load1], %[cospi_22_64] \n\t" 151 "msub $ac1, %[load2], %[cospi_10_64] \n\t" 152 "extp %[result1], $ac1, 31 \n\t" 153 154 "madd $ac3, %[load3], %[cospi_6_64] \n\t" 155 "msub $ac3, %[load4], %[cospi_26_64] \n\t" 156 "extp %[result2], $ac3, 31 \n\t" 157 158 "mtlo %[const_2_power_13], $ac1 \n\t" 159 "mthi $zero, $ac1 \n\t" 160 "mtlo %[const_2_power_13], $ac2 \n\t" 161 "mthi $zero, $ac2 \n\t" 162 163 "madd $ac1, %[load1], %[cospi_10_64] \n\t" 164 "madd $ac1, %[load2], %[cospi_22_64] \n\t" 165 "extp %[result3], $ac1, 31 \n\t" 166 167 "madd $ac2, %[load3], %[cospi_26_64] \n\t" 168 
"madd $ac2, %[load4], %[cospi_6_64] \n\t" 169 "extp %[result4], $ac2, 31 \n\t" 170 171 "mtlo %[const_2_power_13], $ac1 \n\t" 172 "mthi $zero, $ac1 \n\t" 173 "mtlo %[const_2_power_13], $ac3 \n\t" 174 "mthi $zero, $ac3 \n\t" 175 176 "sub %[load1], %[result2], %[result1] \n\t" 177 "sub %[load2], %[result4], %[result3] \n\t" 178 179 "msub $ac1, %[load1], %[cospi_24_64] \n\t" 180 "msub $ac1, %[load2], %[cospi_8_64] \n\t" 181 "madd $ac3, %[load2], %[cospi_24_64] \n\t" 182 "msub $ac3, %[load1], %[cospi_8_64] \n\t" 183 184 "extp %[step2_10], $ac1, 31 \n\t" 185 "extp %[step2_13], $ac3, 31 \n\t" 186 "add %[step2_11], %[result1], %[result2] \n\t" 187 "add %[step2_12], %[result4], %[result3] \n\t" 188 189 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 190 [load4] "=&r"(load4), [result1] "=&r"(result1), 191 [result2] "=&r"(result2), [result3] "=&r"(result3), 192 [result4] "=&r"(result4), [step2_10] "=r"(step2_10), 193 [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), 194 [step2_13] "=r"(step2_13) 195 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 196 [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), 197 [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), 198 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); 199 200 __asm__ __volatile__( 201 "lh %[load5], 4(%[input]) \n\t" 202 "lh %[load6], 28(%[input]) \n\t" 203 "lh %[load7], 20(%[input]) \n\t" 204 "lh %[load8], 12(%[input]) \n\t" 205 206 "mtlo %[const_2_power_13], $ac1 \n\t" 207 "mthi $zero, $ac1 \n\t" 208 "mtlo %[const_2_power_13], $ac3 \n\t" 209 "mthi $zero, $ac3 \n\t" 210 211 "madd $ac1, %[load5], %[cospi_28_64] \n\t" 212 "msub $ac1, %[load6], %[cospi_4_64] \n\t" 213 "extp %[result1], $ac1, 31 \n\t" 214 215 "madd $ac3, %[load7], %[cospi_12_64] \n\t" 216 "msub $ac3, %[load8], %[cospi_20_64] \n\t" 217 "extp %[result2], $ac3, 31 \n\t" 218 219 "mtlo %[const_2_power_13], $ac1 \n\t" 220 "mthi $zero, $ac1 \n\t" 221 "mtlo %[const_2_power_13], $ac2 \n\t" 
222 "mthi $zero, $ac2 \n\t" 223 224 "madd $ac1, %[load7], %[cospi_20_64] \n\t" 225 "madd $ac1, %[load8], %[cospi_12_64] \n\t" 226 "extp %[result3], $ac1, 31 \n\t" 227 228 "madd $ac2, %[load5], %[cospi_4_64] \n\t" 229 "madd $ac2, %[load6], %[cospi_28_64] \n\t" 230 "extp %[result4], $ac2, 31 \n\t" 231 232 "mtlo %[const_2_power_13], $ac1 \n\t" 233 "mthi $zero, $ac1 \n\t" 234 "mtlo %[const_2_power_13], $ac3 \n\t" 235 "mthi $zero, $ac3 \n\t" 236 237 "sub %[load5], %[result4], %[result3] \n\t" 238 "sub %[load5], %[load5], %[result1] \n\t" 239 "add %[load5], %[load5], %[result2] \n\t" 240 241 "sub %[load6], %[result1], %[result2] \n\t" 242 "sub %[load6], %[load6], %[result3] \n\t" 243 "add %[load6], %[load6], %[result4] \n\t" 244 245 "madd $ac1, %[load5], %[cospi_16_64] \n\t" 246 "madd $ac3, %[load6], %[cospi_16_64] \n\t" 247 248 "extp %[step1_5], $ac1, 31 \n\t" 249 "extp %[step1_6], $ac3, 31 \n\t" 250 "add %[step1_4], %[result1], %[result2] \n\t" 251 "add %[step1_7], %[result4], %[result3] \n\t" 252 253 : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), 254 [load8] "=&r"(load8), [result1] "=&r"(result1), 255 [result2] "=&r"(result2), [result3] "=&r"(result3), 256 [result4] "=&r"(result4), [step1_4] "=r"(step1_4), 257 [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), 258 [step1_7] "=r"(step1_7) 259 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 260 [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), 261 [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), 262 [cospi_16_64] "r"(cospi_16_64)); 263 264 __asm__ __volatile__( 265 "mtlo %[const_2_power_13], $ac0 \n\t" 266 "mthi $zero, $ac0 \n\t" 267 "mtlo %[const_2_power_13], $ac1 \n\t" 268 "mthi $zero, $ac1 \n\t" 269 270 "sub %[load5], %[step2_14], %[step2_13] \n\t" 271 "sub %[load5], %[load5], %[step2_9] \n\t" 272 "add %[load5], %[load5], %[step2_10] \n\t" 273 274 "madd $ac0, %[load5], %[cospi_16_64] \n\t" 275 276 "sub %[load6], %[step2_14], %[step2_13] \n\t" 277 "sub 
%[load6], %[load6], %[step2_10] \n\t" 278 "add %[load6], %[load6], %[step2_9] \n\t" 279 280 "madd $ac1, %[load6], %[cospi_16_64] \n\t" 281 282 "mtlo %[const_2_power_13], $ac2 \n\t" 283 "mthi $zero, $ac2 \n\t" 284 "mtlo %[const_2_power_13], $ac3 \n\t" 285 "mthi $zero, $ac3 \n\t" 286 287 "sub %[load5], %[step2_15], %[step2_12] \n\t" 288 "sub %[load5], %[load5], %[step2_8] \n\t" 289 "add %[load5], %[load5], %[step2_11] \n\t" 290 291 "madd $ac2, %[load5], %[cospi_16_64] \n\t" 292 293 "sub %[load6], %[step2_15], %[step2_12] \n\t" 294 "sub %[load6], %[load6], %[step2_11] \n\t" 295 "add %[load6], %[load6], %[step2_8] \n\t" 296 297 "madd $ac3, %[load6], %[cospi_16_64] \n\t" 298 299 "extp %[step1_10], $ac0, 31 \n\t" 300 "extp %[step1_13], $ac1, 31 \n\t" 301 "extp %[step1_11], $ac2, 31 \n\t" 302 "extp %[step1_12], $ac3, 31 \n\t" 303 304 : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), 305 [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), 306 [step1_13] "=r"(step1_13) 307 : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), 308 [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), 309 [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), 310 [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), 311 [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); 312 313 __asm__ __volatile__( 314 "add %[load5], %[step1_0], %[step1_7] \n\t" 315 "add %[load5], %[load5], %[step2_12] \n\t" 316 "add %[load5], %[load5], %[step2_15] \n\t" 317 "add %[load6], %[step1_1], %[step1_6] \n\t" 318 "add %[load6], %[load6], %[step2_13] \n\t" 319 "add %[load6], %[load6], %[step2_14] \n\t" 320 "sh %[load5], 0(%[output]) \n\t" 321 "sh %[load6], 32(%[output]) \n\t" 322 "sub %[load5], %[step1_1], %[step1_6] \n\t" 323 "add %[load5], %[load5], %[step2_9] \n\t" 324 "add %[load5], %[load5], %[step2_10] \n\t" 325 "sub %[load6], %[step1_0], %[step1_7] \n\t" 326 "add %[load6], %[load6], %[step2_8] \n\t" 327 "add %[load6], %[load6], %[step2_11] \n\t" 328 "sh %[load5], 
192(%[output]) \n\t" 329 "sh %[load6], 224(%[output]) \n\t" 330 "sub %[load5], %[step1_0], %[step1_7] \n\t" 331 "sub %[load5], %[load5], %[step2_8] \n\t" 332 "sub %[load5], %[load5], %[step2_11] \n\t" 333 "sub %[load6], %[step1_1], %[step1_6] \n\t" 334 "sub %[load6], %[load6], %[step2_9] \n\t" 335 "sub %[load6], %[load6], %[step2_10] \n\t" 336 "sh %[load5], 256(%[output]) \n\t" 337 "sh %[load6], 288(%[output]) \n\t" 338 "add %[load5], %[step1_1], %[step1_6] \n\t" 339 "sub %[load5], %[load5], %[step2_13] \n\t" 340 "sub %[load5], %[load5], %[step2_14] \n\t" 341 "add %[load6], %[step1_0], %[step1_7] \n\t" 342 "sub %[load6], %[load6], %[step2_12] \n\t" 343 "sub %[load6], %[load6], %[step2_15] \n\t" 344 "sh %[load5], 448(%[output]) \n\t" 345 "sh %[load6], 480(%[output]) \n\t" 346 347 : [load5] "=&r"(load5), [load6] "=&r"(load6) 348 : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), 349 [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), 350 [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), 351 [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), 352 [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), 353 [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); 354 355 __asm__ __volatile__( 356 "add %[load5], %[step1_2], %[step1_5] \n\t" 357 "add %[load5], %[load5], %[step1_13] \n\t" 358 "add %[load6], %[step1_3], %[step1_4] \n\t" 359 "add %[load6], %[load6], %[step1_12] \n\t" 360 "sh %[load5], 64(%[output]) \n\t" 361 "sh %[load6], 96(%[output]) \n\t" 362 "sub %[load5], %[step1_3], %[step1_4] \n\t" 363 "add %[load5], %[load5], %[step1_11] \n\t" 364 "sub %[load6], %[step1_2], %[step1_5] \n\t" 365 "add %[load6], %[load6], %[step1_10] \n\t" 366 "sh %[load5], 128(%[output]) \n\t" 367 "sh %[load6], 160(%[output]) \n\t" 368 "sub %[load5], %[step1_2], %[step1_5] \n\t" 369 "sub %[load5], %[load5], %[step1_10] \n\t" 370 "sub %[load6], %[step1_3], %[step1_4] \n\t" 371 "sub %[load6], %[load6], %[step1_11] \n\t" 372 "sh %[load5], 320(%[output]) \n\t" 373 "sh %[load6], 
352(%[output]) \n\t" 374 "add %[load5], %[step1_3], %[step1_4] \n\t" 375 "sub %[load5], %[load5], %[step1_12] \n\t" 376 "add %[load6], %[step1_2], %[step1_5] \n\t" 377 "sub %[load6], %[load6], %[step1_13] \n\t" 378 "sh %[load5], 384(%[output]) \n\t" 379 "sh %[load6], 416(%[output]) \n\t" 380 381 : [load5] "=&r"(load5), [load6] "=&r"(load6) 382 : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), 383 [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), 384 [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), 385 [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); 386 387 input += 16; 388 output += 1; 389 } 390} 391 392void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { 393 int i; 394 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 395 int step1_8, step1_9, step1_10, step1_11; 396 int step1_12, step1_13, step1_14, step1_15; 397 int step2_0, step2_1, step2_2, step2_3; 398 int step2_8, step2_9, step2_10, step2_11; 399 int step2_12, step2_13, step2_14, step2_15; 400 int load1, load2, load3, load4, load5, load6, load7, load8; 401 int result1, result2, result3, result4; 402 const int const_2_power_13 = 8192; 403 uint8_t *dest_pix; 404 uint8_t *cm = vpx_ff_cropTbl; 405 406 /* prefetch vpx_ff_cropTbl */ 407 prefetch_load(vpx_ff_cropTbl); 408 prefetch_load(vpx_ff_cropTbl + 32); 409 prefetch_load(vpx_ff_cropTbl + 64); 410 prefetch_load(vpx_ff_cropTbl + 96); 411 prefetch_load(vpx_ff_cropTbl + 128); 412 prefetch_load(vpx_ff_cropTbl + 160); 413 prefetch_load(vpx_ff_cropTbl + 192); 414 prefetch_load(vpx_ff_cropTbl + 224); 415 416 for (i = 0; i < 16; ++i) { 417 dest_pix = (dest + i); 418 __asm__ __volatile__( 419 "lh %[load1], 0(%[input]) \n\t" 420 "lh %[load2], 16(%[input]) \n\t" 421 "lh %[load3], 8(%[input]) \n\t" 422 "lh %[load4], 24(%[input]) \n\t" 423 424 "mtlo %[const_2_power_13], $ac1 \n\t" 425 "mthi $zero, $ac1 \n\t" 426 "mtlo %[const_2_power_13], $ac2 \n\t" 427 "mthi $zero, $ac2 \n\t" 428 "add %[result1], 
%[load1], %[load2] \n\t" 429 "sub %[result2], %[load1], %[load2] \n\t" 430 "madd $ac1, %[result1], %[cospi_16_64] \n\t" 431 "madd $ac2, %[result2], %[cospi_16_64] \n\t" 432 "extp %[step2_0], $ac1, 31 \n\t" 433 "extp %[step2_1], $ac2, 31 \n\t" 434 435 "mtlo %[const_2_power_13], $ac3 \n\t" 436 "mthi $zero, $ac3 \n\t" 437 "madd $ac3, %[load3], %[cospi_24_64] \n\t" 438 "msub $ac3, %[load4], %[cospi_8_64] \n\t" 439 "extp %[step2_2], $ac3, 31 \n\t" 440 441 "mtlo %[const_2_power_13], $ac1 \n\t" 442 "mthi $zero, $ac1 \n\t" 443 "madd $ac1, %[load3], %[cospi_8_64] \n\t" 444 "madd $ac1, %[load4], %[cospi_24_64] \n\t" 445 "extp %[step2_3], $ac1, 31 \n\t" 446 447 "add %[step1_0], %[step2_0], %[step2_3] \n\t" 448 "add %[step1_1], %[step2_1], %[step2_2] \n\t" 449 "sub %[step1_2], %[step2_1], %[step2_2] \n\t" 450 "sub %[step1_3], %[step2_0], %[step2_3] \n\t" 451 452 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 453 [load4] "=&r"(load4), [result1] "=&r"(result1), 454 [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), 455 [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), 456 [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), 457 [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), 458 [step1_3] "=r"(step1_3) 459 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 460 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), 461 [cospi_16_64] "r"(cospi_16_64)); 462 463 __asm__ __volatile__( 464 "lh %[load5], 2(%[input]) \n\t" 465 "lh %[load6], 30(%[input]) \n\t" 466 "lh %[load7], 18(%[input]) \n\t" 467 "lh %[load8], 14(%[input]) \n\t" 468 469 "mtlo %[const_2_power_13], $ac1 \n\t" 470 "mthi $zero, $ac1 \n\t" 471 "mtlo %[const_2_power_13], $ac3 \n\t" 472 "mthi $zero, $ac3 \n\t" 473 474 "madd $ac1, %[load5], %[cospi_30_64] \n\t" 475 "msub $ac1, %[load6], %[cospi_2_64] \n\t" 476 "extp %[result1], $ac1, 31 \n\t" 477 478 "madd $ac3, %[load7], %[cospi_14_64] \n\t" 479 "msub $ac3, %[load8], %[cospi_18_64] \n\t" 480 "extp %[result2], $ac3, 31 \n\t" 481 
482 "mtlo %[const_2_power_13], $ac1 \n\t" 483 "mthi $zero, $ac1 \n\t" 484 "mtlo %[const_2_power_13], $ac2 \n\t" 485 "mthi $zero, $ac2 \n\t" 486 487 "madd $ac1, %[load7], %[cospi_18_64] \n\t" 488 "madd $ac1, %[load8], %[cospi_14_64] \n\t" 489 "extp %[result3], $ac1, 31 \n\t" 490 491 "madd $ac2, %[load5], %[cospi_2_64] \n\t" 492 "madd $ac2, %[load6], %[cospi_30_64] \n\t" 493 "extp %[result4], $ac2, 31 \n\t" 494 495 "sub %[load5], %[result1], %[result2] \n\t" 496 "sub %[load6], %[result4], %[result3] \n\t" 497 498 "mtlo %[const_2_power_13], $ac1 \n\t" 499 "mthi $zero, $ac1 \n\t" 500 "mtlo %[const_2_power_13], $ac3 \n\t" 501 "mthi $zero, $ac3 \n\t" 502 503 "madd $ac1, %[load6], %[cospi_24_64] \n\t" 504 "msub $ac1, %[load5], %[cospi_8_64] \n\t" 505 "madd $ac3, %[load5], %[cospi_24_64] \n\t" 506 "madd $ac3, %[load6], %[cospi_8_64] \n\t" 507 508 "extp %[step2_9], $ac1, 31 \n\t" 509 "extp %[step2_14], $ac3, 31 \n\t" 510 "add %[step2_8], %[result1], %[result2] \n\t" 511 "add %[step2_15], %[result4], %[result3] \n\t" 512 513 : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), 514 [load8] "=&r"(load8), [result1] "=&r"(result1), 515 [result2] "=&r"(result2), [result3] "=&r"(result3), 516 [result4] "=&r"(result4), [step2_8] "=r"(step2_8), 517 [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), 518 [step2_14] "=r"(step2_14) 519 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 520 [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), 521 [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), 522 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); 523 524 __asm__ __volatile__( 525 "lh %[load1], 10(%[input]) \n\t" 526 "lh %[load2], 22(%[input]) \n\t" 527 "lh %[load3], 26(%[input]) \n\t" 528 "lh %[load4], 6(%[input]) \n\t" 529 530 "mtlo %[const_2_power_13], $ac1 \n\t" 531 "mthi $zero, $ac1 \n\t" 532 "mtlo %[const_2_power_13], $ac3 \n\t" 533 "mthi $zero, $ac3 \n\t" 534 535 "madd $ac1, %[load1], %[cospi_22_64] \n\t" 536 "msub 
$ac1, %[load2], %[cospi_10_64] \n\t" 537 "extp %[result1], $ac1, 31 \n\t" 538 539 "madd $ac3, %[load3], %[cospi_6_64] \n\t" 540 "msub $ac3, %[load4], %[cospi_26_64] \n\t" 541 "extp %[result2], $ac3, 31 \n\t" 542 543 "mtlo %[const_2_power_13], $ac1 \n\t" 544 "mthi $zero, $ac1 \n\t" 545 "mtlo %[const_2_power_13], $ac2 \n\t" 546 "mthi $zero, $ac2 \n\t" 547 548 "madd $ac1, %[load1], %[cospi_10_64] \n\t" 549 "madd $ac1, %[load2], %[cospi_22_64] \n\t" 550 "extp %[result3], $ac1, 31 \n\t" 551 552 "madd $ac2, %[load3], %[cospi_26_64] \n\t" 553 "madd $ac2, %[load4], %[cospi_6_64] \n\t" 554 "extp %[result4], $ac2, 31 \n\t" 555 556 "mtlo %[const_2_power_13], $ac1 \n\t" 557 "mthi $zero, $ac1 \n\t" 558 "mtlo %[const_2_power_13], $ac3 \n\t" 559 "mthi $zero, $ac3 \n\t" 560 561 "sub %[load1], %[result2], %[result1] \n\t" 562 "sub %[load2], %[result4], %[result3] \n\t" 563 564 "msub $ac1, %[load1], %[cospi_24_64] \n\t" 565 "msub $ac1, %[load2], %[cospi_8_64] \n\t" 566 "madd $ac3, %[load2], %[cospi_24_64] \n\t" 567 "msub $ac3, %[load1], %[cospi_8_64] \n\t" 568 569 "extp %[step2_10], $ac1, 31 \n\t" 570 "extp %[step2_13], $ac3, 31 \n\t" 571 "add %[step2_11], %[result1], %[result2] \n\t" 572 "add %[step2_12], %[result4], %[result3] \n\t" 573 574 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 575 [load4] "=&r"(load4), [result1] "=&r"(result1), 576 [result2] "=&r"(result2), [result3] "=&r"(result3), 577 [result4] "=&r"(result4), [step2_10] "=r"(step2_10), 578 [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), 579 [step2_13] "=r"(step2_13) 580 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 581 [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), 582 [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), 583 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); 584 585 __asm__ __volatile__( 586 "lh %[load5], 4(%[input]) \n\t" 587 "lh %[load6], 28(%[input]) \n\t" 588 "lh %[load7], 20(%[input]) \n\t" 589 "lh %[load8], 
12(%[input]) \n\t" 590 591 "mtlo %[const_2_power_13], $ac1 \n\t" 592 "mthi $zero, $ac1 \n\t" 593 "mtlo %[const_2_power_13], $ac3 \n\t" 594 "mthi $zero, $ac3 \n\t" 595 596 "madd $ac1, %[load5], %[cospi_28_64] \n\t" 597 "msub $ac1, %[load6], %[cospi_4_64] \n\t" 598 "extp %[result1], $ac1, 31 \n\t" 599 600 "madd $ac3, %[load7], %[cospi_12_64] \n\t" 601 "msub $ac3, %[load8], %[cospi_20_64] \n\t" 602 "extp %[result2], $ac3, 31 \n\t" 603 604 "mtlo %[const_2_power_13], $ac1 \n\t" 605 "mthi $zero, $ac1 \n\t" 606 "mtlo %[const_2_power_13], $ac2 \n\t" 607 "mthi $zero, $ac2 \n\t" 608 609 "madd $ac1, %[load7], %[cospi_20_64] \n\t" 610 "madd $ac1, %[load8], %[cospi_12_64] \n\t" 611 "extp %[result3], $ac1, 31 \n\t" 612 613 "madd $ac2, %[load5], %[cospi_4_64] \n\t" 614 "madd $ac2, %[load6], %[cospi_28_64] \n\t" 615 "extp %[result4], $ac2, 31 \n\t" 616 617 "mtlo %[const_2_power_13], $ac1 \n\t" 618 "mthi $zero, $ac1 \n\t" 619 "mtlo %[const_2_power_13], $ac3 \n\t" 620 "mthi $zero, $ac3 \n\t" 621 622 "sub %[load5], %[result4], %[result3] \n\t" 623 "sub %[load5], %[load5], %[result1] \n\t" 624 "add %[load5], %[load5], %[result2] \n\t" 625 626 "sub %[load6], %[result1], %[result2] \n\t" 627 "sub %[load6], %[load6], %[result3] \n\t" 628 "add %[load6], %[load6], %[result4] \n\t" 629 630 "madd $ac1, %[load5], %[cospi_16_64] \n\t" 631 "madd $ac3, %[load6], %[cospi_16_64] \n\t" 632 633 "extp %[step1_5], $ac1, 31 \n\t" 634 "extp %[step1_6], $ac3, 31 \n\t" 635 636 "add %[step1_4], %[result1], %[result2] \n\t" 637 "add %[step1_7], %[result4], %[result3] \n\t" 638 639 : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), 640 [load8] "=&r"(load8), [result1] "=&r"(result1), 641 [result2] "=&r"(result2), [result3] "=&r"(result3), 642 [result4] "=&r"(result4), [step1_4] "=r"(step1_4), 643 [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), 644 [step1_7] "=r"(step1_7) 645 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 646 [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] 
"r"(cospi_12_64), 647 [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), 648 [cospi_16_64] "r"(cospi_16_64)); 649 650 __asm__ __volatile__( 651 "mtlo %[const_2_power_13], $ac0 \n\t" 652 "mthi $zero, $ac0 \n\t" 653 "mtlo %[const_2_power_13], $ac1 \n\t" 654 "mthi $zero, $ac1 \n\t" 655 656 "sub %[load5], %[step2_14], %[step2_13] \n\t" 657 "sub %[load5], %[load5], %[step2_9] \n\t" 658 "add %[load5], %[load5], %[step2_10] \n\t" 659 660 "madd $ac0, %[load5], %[cospi_16_64] \n\t" 661 662 "sub %[load6], %[step2_14], %[step2_13] \n\t" 663 "sub %[load6], %[load6], %[step2_10] \n\t" 664 "add %[load6], %[load6], %[step2_9] \n\t" 665 666 "madd $ac1, %[load6], %[cospi_16_64] \n\t" 667 668 "mtlo %[const_2_power_13], $ac2 \n\t" 669 "mthi $zero, $ac2 \n\t" 670 "mtlo %[const_2_power_13], $ac3 \n\t" 671 "mthi $zero, $ac3 \n\t" 672 673 "sub %[load5], %[step2_15], %[step2_12] \n\t" 674 "sub %[load5], %[load5], %[step2_8] \n\t" 675 "add %[load5], %[load5], %[step2_11] \n\t" 676 677 "madd $ac2, %[load5], %[cospi_16_64] \n\t" 678 679 "sub %[load6], %[step2_15], %[step2_12] \n\t" 680 "sub %[load6], %[load6], %[step2_11] \n\t" 681 "add %[load6], %[load6], %[step2_8] \n\t" 682 683 "madd $ac3, %[load6], %[cospi_16_64] \n\t" 684 685 "extp %[step1_10], $ac0, 31 \n\t" 686 "extp %[step1_13], $ac1, 31 \n\t" 687 "extp %[step1_11], $ac2, 31 \n\t" 688 "extp %[step1_12], $ac3, 31 \n\t" 689 690 : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), 691 [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), 692 [step1_13] "=r"(step1_13) 693 : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), 694 [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), 695 [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), 696 [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), 697 [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); 698 699 step1_8 = step2_8 + step2_11; 700 step1_9 = step2_9 + step2_10; 701 step1_14 = step2_13 + step2_14; 702 step1_15 = step2_12 + step2_15; 
703 704 __asm__ __volatile__( 705 "lbu %[load7], 0(%[dest_pix]) \n\t" 706 "add %[load5], %[step1_0], %[step1_7] \n\t" 707 "add %[load5], %[load5], %[step1_15] \n\t" 708 "addi %[load5], %[load5], 32 \n\t" 709 "sra %[load5], %[load5], 6 \n\t" 710 "add %[load7], %[load7], %[load5] \n\t" 711 "lbux %[load5], %[load7](%[cm]) \n\t" 712 "add %[load6], %[step1_1], %[step1_6] \n\t" 713 "add %[load6], %[load6], %[step1_14] \n\t" 714 "sb %[load5], 0(%[dest_pix]) \n\t" 715 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 716 "lbu %[load8], 0(%[dest_pix]) \n\t" 717 "addi %[load6], %[load6], 32 \n\t" 718 "sra %[load6], %[load6], 6 \n\t" 719 "add %[load8], %[load8], %[load6] \n\t" 720 "lbux %[load6], %[load8](%[cm]) \n\t" 721 "sb %[load6], 0(%[dest_pix]) \n\t" 722 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 723 724 "lbu %[load7], 0(%[dest_pix]) \n\t" 725 "add %[load5], %[step1_2], %[step1_5] \n\t" 726 "add %[load5], %[load5], %[step1_13] \n\t" 727 "addi %[load5], %[load5], 32 \n\t" 728 "sra %[load5], %[load5], 6 \n\t" 729 "add %[load7], %[load7], %[load5] \n\t" 730 "lbux %[load5], %[load7](%[cm]) \n\t" 731 "add %[load6], %[step1_3], %[step1_4] \n\t" 732 "add %[load6], %[load6], %[step1_12] \n\t" 733 "sb %[load5], 0(%[dest_pix]) \n\t" 734 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 735 "lbu %[load8], 0(%[dest_pix]) \n\t" 736 "addi %[load6], %[load6], 32 \n\t" 737 "sra %[load6], %[load6], 6 \n\t" 738 "add %[load8], %[load8], %[load6] \n\t" 739 "lbux %[load6], %[load8](%[cm]) \n\t" 740 "sb %[load6], 0(%[dest_pix]) \n\t" 741 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 742 743 "lbu %[load7], 0(%[dest_pix]) \n\t" 744 "sub %[load5], %[step1_3], %[step1_4] \n\t" 745 "add %[load5], %[load5], %[step1_11] \n\t" 746 "addi %[load5], %[load5], 32 \n\t" 747 "sra %[load5], %[load5], 6 \n\t" 748 "add %[load7], %[load7], %[load5] \n\t" 749 "lbux %[load5], %[load7](%[cm]) \n\t" 750 "sub %[load6], %[step1_2], %[step1_5] \n\t" 751 "add %[load6], %[load6], %[step1_10] \n\t" 752 "sb %[load5], 
0(%[dest_pix]) \n\t" 753 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 754 "lbu %[load8], 0(%[dest_pix]) \n\t" 755 "addi %[load6], %[load6], 32 \n\t" 756 "sra %[load6], %[load6], 6 \n\t" 757 "add %[load8], %[load8], %[load6] \n\t" 758 "lbux %[load6], %[load8](%[cm]) \n\t" 759 "sb %[load6], 0(%[dest_pix]) \n\t" 760 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 761 762 "sub %[load5], %[step1_1], %[step1_6] \n\t" 763 "lbu %[load7], 0(%[dest_pix]) \n\t" 764 "add %[load5], %[load5], %[step1_9] \n\t" 765 "addi %[load5], %[load5], 32 \n\t" 766 "sra %[load5], %[load5], 6 \n\t" 767 "add %[load7], %[load7], %[load5] \n\t" 768 "lbux %[load5], %[load7](%[cm]) \n\t" 769 "sub %[load6], %[step1_0], %[step1_7] \n\t" 770 "add %[load6], %[load6], %[step1_8] \n\t" 771 "sb %[load5], 0(%[dest_pix]) \n\t" 772 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 773 "lbu %[load8], 0(%[dest_pix]) \n\t" 774 "addi %[load6], %[load6], 32 \n\t" 775 "sra %[load6], %[load6], 6 \n\t" 776 "add %[load8], %[load8], %[load6] \n\t" 777 "lbux %[load6], %[load8](%[cm]) \n\t" 778 "sb %[load6], 0(%[dest_pix]) \n\t" 779 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 780 781 "lbu %[load7], 0(%[dest_pix]) \n\t" 782 "sub %[load5], %[step1_0], %[step1_7] \n\t" 783 "sub %[load5], %[load5], %[step1_8] \n\t" 784 "addi %[load5], %[load5], 32 \n\t" 785 "sra %[load5], %[load5], 6 \n\t" 786 "add %[load7], %[load7], %[load5] \n\t" 787 "lbux %[load5], %[load7](%[cm]) \n\t" 788 "sub %[load6], %[step1_1], %[step1_6] \n\t" 789 "sub %[load6], %[load6], %[step1_9] \n\t" 790 "sb %[load5], 0(%[dest_pix]) \n\t" 791 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 792 "lbu %[load8], 0(%[dest_pix]) \n\t" 793 "addi %[load6], %[load6], 32 \n\t" 794 "sra %[load6], %[load6], 6 \n\t" 795 "add %[load8], %[load8], %[load6] \n\t" 796 "lbux %[load6], %[load8](%[cm]) \n\t" 797 "sb %[load6], 0(%[dest_pix]) \n\t" 798 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 799 800 "lbu %[load7], 0(%[dest_pix]) \n\t" 801 "sub %[load5], %[step1_2], 
%[step1_5] \n\t" 802 "sub %[load5], %[load5], %[step1_10] \n\t" 803 "addi %[load5], %[load5], 32 \n\t" 804 "sra %[load5], %[load5], 6 \n\t" 805 "add %[load7], %[load7], %[load5] \n\t" 806 "lbux %[load5], %[load7](%[cm]) \n\t" 807 "sub %[load6], %[step1_3], %[step1_4] \n\t" 808 "sub %[load6], %[load6], %[step1_11] \n\t" 809 "sb %[load5], 0(%[dest_pix]) \n\t" 810 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 811 "lbu %[load8], 0(%[dest_pix]) \n\t" 812 "addi %[load6], %[load6], 32 \n\t" 813 "sra %[load6], %[load6], 6 \n\t" 814 "add %[load8], %[load8], %[load6] \n\t" 815 "lbux %[load6], %[load8](%[cm]) \n\t" 816 "sb %[load6], 0(%[dest_pix]) \n\t" 817 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 818 819 "lbu %[load7], 0(%[dest_pix]) \n\t" 820 "add %[load5], %[step1_3], %[step1_4] \n\t" 821 "sub %[load5], %[load5], %[step1_12] \n\t" 822 "addi %[load5], %[load5], 32 \n\t" 823 "sra %[load5], %[load5], 6 \n\t" 824 "add %[load7], %[load7], %[load5] \n\t" 825 "lbux %[load5], %[load7](%[cm]) \n\t" 826 "add %[load6], %[step1_2], %[step1_5] \n\t" 827 "sub %[load6], %[load6], %[step1_13] \n\t" 828 "sb %[load5], 0(%[dest_pix]) \n\t" 829 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 830 "lbu %[load8], 0(%[dest_pix]) \n\t" 831 "addi %[load6], %[load6], 32 \n\t" 832 "sra %[load6], %[load6], 6 \n\t" 833 "add %[load8], %[load8], %[load6] \n\t" 834 "lbux %[load6], %[load8](%[cm]) \n\t" 835 "sb %[load6], 0(%[dest_pix]) \n\t" 836 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 837 838 "lbu %[load7], 0(%[dest_pix]) \n\t" 839 "add %[load5], %[step1_1], %[step1_6] \n\t" 840 "sub %[load5], %[load5], %[step1_14] \n\t" 841 "addi %[load5], %[load5], 32 \n\t" 842 "sra %[load5], %[load5], 6 \n\t" 843 "add %[load7], %[load7], %[load5] \n\t" 844 "lbux %[load5], %[load7](%[cm]) \n\t" 845 "add %[load6], %[step1_0], %[step1_7] \n\t" 846 "sub %[load6], %[load6], %[step1_15] \n\t" 847 "sb %[load5], 0(%[dest_pix]) \n\t" 848 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 849 "lbu %[load8], 
0(%[dest_pix]) \n\t" 850 "addi %[load6], %[load6], 32 \n\t" 851 "sra %[load6], %[load6], 6 \n\t" 852 "add %[load8], %[load8], %[load6] \n\t" 853 "lbux %[load6], %[load8](%[cm]) \n\t" 854 "sb %[load6], 0(%[dest_pix]) \n\t" 855 856 : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), 857 [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) 858 : 859 [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), 860 [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), 861 [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), 862 [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), 863 [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), 864 [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), 865 [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); 866 867 input += 16; 868 } 869} 870 871void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, 872 int stride) { 873 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); 874 uint32_t pos = 45; 875 876 /* bit positon for extract from acc */ 877 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); 878 879 // First transform rows 880 idct16_rows_dspr2(input, out, 16); 881 882 // Then transform columns and add to dest 883 idct16_cols_add_blk_dspr2(out, dest, stride); 884} 885 886void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, 887 int stride) { 888 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); 889 int16_t *outptr = out; 890 uint32_t i; 891 uint32_t pos = 45; 892 893 /* bit positon for extract from acc */ 894 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); 895 896 // First transform rows. Since all non-zero dct coefficients are in 897 // upper-left 4x4 area, we only need to calculate first 4 rows here. 
898 idct16_rows_dspr2(input, outptr, 4); 899 900 outptr += 4; 901 for (i = 0; i < 6; ++i) { 902 __asm__ __volatile__( 903 "sw $zero, 0(%[outptr]) \n\t" 904 "sw $zero, 32(%[outptr]) \n\t" 905 "sw $zero, 64(%[outptr]) \n\t" 906 "sw $zero, 96(%[outptr]) \n\t" 907 "sw $zero, 128(%[outptr]) \n\t" 908 "sw $zero, 160(%[outptr]) \n\t" 909 "sw $zero, 192(%[outptr]) \n\t" 910 "sw $zero, 224(%[outptr]) \n\t" 911 "sw $zero, 256(%[outptr]) \n\t" 912 "sw $zero, 288(%[outptr]) \n\t" 913 "sw $zero, 320(%[outptr]) \n\t" 914 "sw $zero, 352(%[outptr]) \n\t" 915 "sw $zero, 384(%[outptr]) \n\t" 916 "sw $zero, 416(%[outptr]) \n\t" 917 "sw $zero, 448(%[outptr]) \n\t" 918 "sw $zero, 480(%[outptr]) \n\t" 919 920 : 921 : [outptr] "r"(outptr)); 922 923 outptr += 2; 924 } 925 926 // Then transform columns 927 idct16_cols_add_blk_dspr2(out, dest, stride); 928} 929 930void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, 931 int stride) { 932 uint32_t pos = 45; 933 int32_t out; 934 int32_t r; 935 int32_t a1, absa1; 936 int32_t vector_a1; 937 int32_t t1, t2, t3, t4; 938 int32_t vector_1, vector_2, vector_3, vector_4; 939 940 /* bit positon for extract from acc */ 941 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 942 943 : 944 : [pos] "r"(pos)); 945 946 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); 947 __asm__ __volatile__( 948 "addi %[out], %[out], 32 \n\t" 949 "sra %[a1], %[out], 6 \n\t" 950 951 : [out] "+r"(out), [a1] "=r"(a1) 952 :); 953 954 if (a1 < 0) { 955 /* use quad-byte 956 * input and output memory are four byte aligned */ 957 __asm__ __volatile__( 958 "abs %[absa1], %[a1] \n\t" 959 "replv.qb %[vector_a1], %[absa1] \n\t" 960 961 : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) 962 : [a1] "r"(a1)); 963 964 for (r = 16; r--;) { 965 __asm__ __volatile__( 966 "lw %[t1], 0(%[dest]) \n\t" 967 "lw %[t2], 4(%[dest]) \n\t" 968 "lw %[t3], 8(%[dest]) \n\t" 969 "lw %[t4], 12(%[dest]) \n\t" 970 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 971 "subu_s.qb %[vector_2], 
// DC-only 16x16 inverse transform: when only the DC coefficient is
// non-zero, every output pixel receives the same rounded offset a1,
// so the full transform is skipped and the offset is applied to the
// destination four pixels at a time with saturating DSPr2 arithmetic.
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"

                       :
                       : [pos] "r"(pos));

  // DC value run through the cospi_16_64 scaling of both 1-D passes.
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  // Final rounding of the 2-D result: a1 = (out + 32) >> 6.
  __asm__ __volatile__(
      "addi %[out], %[out], 32 \n\t"
      "sra %[a1], %[out], 6 \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :);

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    // Negative offset: replicate |a1| into all four byte lanes and
    // subtract with unsigned saturation (clamps each pixel at 0).
    __asm__ __volatile__(
        "abs %[absa1], %[a1] \n\t"
        "replv.qb %[vector_a1], %[absa1] \n\t"

        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r"(a1));

    // 16 rows, 16 pixels (four words) per row.
    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "add %[dest], %[dest], %[stride] \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
    }
  } else if (a1 > 255) {
    int32_t a11, a12, vector_a11, vector_a12;

    /* use quad-byte
     * input and output memory are four byte aligned */
    // replv.qb replicates only the low byte, so a large offset is split
    // into two halves, each applied with a saturating add.
    // NOTE(review): assumes a1 <= 510 so both halves fit in a byte —
    // confirm against the DC magnitude bound.
    a11 = a1 >> 1;
    a12 = a1 - a11;
    __asm__ __volatile__(
        "replv.qb %[vector_a11], %[a11] \n\t"
        "replv.qb %[vector_a12], %[a12] \n\t"

        : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
        : [a11] "r"(a11), [a12] "r"(a12));

    // 16 rows, 16 pixels (four words) per row; two saturating adds each.
    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
          "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
          "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
          "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
          "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
          "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
          "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
          "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "add %[dest], %[dest], %[stride] \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
            [vector_a12] "r"(vector_a12));
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    // 0 <= a1 <= 255: a single replicated byte added with saturation
    // (clamps each pixel at 255).
    __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"

                         : [vector_a1] "=r"(vector_a1)
                         : [a1] "r"(a1));

    // 16 rows, 16 pixels (four words) per row.
    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "add %[dest], %[dest], %[stride] \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
    }
  }
}
{ 1082 output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = 1083 output[6] = output[7] = output[8] = output[9] = output[10] = 1084 output[11] = output[12] = output[13] = output[14] = output[15] = 0; 1085 return; 1086 } 1087 1088 // stage 1 1089 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 1090 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 1091 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 1092 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 1093 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 1094 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 1095 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 1096 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 1097 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 1098 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 1099 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 1100 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 1101 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 1102 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 1103 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 1104 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 1105 1106 x0 = dct_const_round_shift(s0 + s8); 1107 x1 = dct_const_round_shift(s1 + s9); 1108 x2 = dct_const_round_shift(s2 + s10); 1109 x3 = dct_const_round_shift(s3 + s11); 1110 x4 = dct_const_round_shift(s4 + s12); 1111 x5 = dct_const_round_shift(s5 + s13); 1112 x6 = dct_const_round_shift(s6 + s14); 1113 x7 = dct_const_round_shift(s7 + s15); 1114 x8 = dct_const_round_shift(s0 - s8); 1115 x9 = dct_const_round_shift(s1 - s9); 1116 x10 = dct_const_round_shift(s2 - s10); 1117 x11 = dct_const_round_shift(s3 - s11); 1118 x12 = dct_const_round_shift(s4 - s12); 1119 x13 = dct_const_round_shift(s5 - s13); 1120 x14 = dct_const_round_shift(s6 - s14); 1121 x15 = dct_const_round_shift(s7 - s15); 1122 1123 // stage 2 1124 s0 = x0; 1125 s1 = x1; 1126 s2 = x2; 1127 s3 = x3; 1128 s4 = x4; 1129 s5 = x5; 1130 s6 = x6; 1131 s7 = x7; 1132 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 1133 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 1134 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 1135 s11 = 
x10 * cospi_12_64 - x11 * cospi_20_64; 1136 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; 1137 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 1138 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; 1139 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 1140 1141 x0 = s0 + s4; 1142 x1 = s1 + s5; 1143 x2 = s2 + s6; 1144 x3 = s3 + s7; 1145 x4 = s0 - s4; 1146 x5 = s1 - s5; 1147 x6 = s2 - s6; 1148 x7 = s3 - s7; 1149 x8 = dct_const_round_shift(s8 + s12); 1150 x9 = dct_const_round_shift(s9 + s13); 1151 x10 = dct_const_round_shift(s10 + s14); 1152 x11 = dct_const_round_shift(s11 + s15); 1153 x12 = dct_const_round_shift(s8 - s12); 1154 x13 = dct_const_round_shift(s9 - s13); 1155 x14 = dct_const_round_shift(s10 - s14); 1156 x15 = dct_const_round_shift(s11 - s15); 1157 1158 // stage 3 1159 s0 = x0; 1160 s1 = x1; 1161 s2 = x2; 1162 s3 = x3; 1163 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 1164 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 1165 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; 1166 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 1167 s8 = x8; 1168 s9 = x9; 1169 s10 = x10; 1170 s11 = x11; 1171 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 1172 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 1173 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; 1174 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 1175 1176 x0 = s0 + s2; 1177 x1 = s1 + s3; 1178 x2 = s0 - s2; 1179 x3 = s1 - s3; 1180 x4 = dct_const_round_shift(s4 + s6); 1181 x5 = dct_const_round_shift(s5 + s7); 1182 x6 = dct_const_round_shift(s4 - s6); 1183 x7 = dct_const_round_shift(s5 - s7); 1184 x8 = s8 + s10; 1185 x9 = s9 + s11; 1186 x10 = s8 - s10; 1187 x11 = s9 - s11; 1188 x12 = dct_const_round_shift(s12 + s14); 1189 x13 = dct_const_round_shift(s13 + s15); 1190 x14 = dct_const_round_shift(s12 - s14); 1191 x15 = dct_const_round_shift(s13 - s15); 1192 1193 // stage 4 1194 s2 = (-cospi_16_64) * (x2 + x3); 1195 s3 = cospi_16_64 * (x2 - x3); 1196 s6 = cospi_16_64 * (x6 + x7); 1197 s7 = cospi_16_64 * (-x6 + x7); 1198 s10 = cospi_16_64 * (x10 + x11); 1199 s11 = 
cospi_16_64 * (-x10 + x11); 1200 s14 = (-cospi_16_64) * (x14 + x15); 1201 s15 = cospi_16_64 * (x14 - x15); 1202 1203 x2 = dct_const_round_shift(s2); 1204 x3 = dct_const_round_shift(s3); 1205 x6 = dct_const_round_shift(s6); 1206 x7 = dct_const_round_shift(s7); 1207 x10 = dct_const_round_shift(s10); 1208 x11 = dct_const_round_shift(s11); 1209 x14 = dct_const_round_shift(s14); 1210 x15 = dct_const_round_shift(s15); 1211 1212 output[0] = x0; 1213 output[1] = -x8; 1214 output[2] = x12; 1215 output[3] = -x4; 1216 output[4] = x6; 1217 output[5] = x14; 1218 output[6] = x10; 1219 output[7] = x2; 1220 output[8] = x3; 1221 output[9] = x11; 1222 output[10] = x15; 1223 output[11] = x7; 1224 output[12] = x5; 1225 output[13] = -x13; 1226 output[14] = x9; 1227 output[15] = -x1; 1228} 1229 1230#endif // HAVE_DSPR2 1231