1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vpx_config.h" 12#include "vpx_dsp/mips/inv_txfm_dspr2.h" 13#include "vpx_dsp/txfm_common.h" 14 15#if HAVE_DSPR2 16void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { 17 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; 18 int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; 19 int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; 20 int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; 21 int step1_28, step1_29, step1_30, step1_31; 22 int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 23 int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; 24 int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; 25 int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; 26 int step2_28, step2_29, step2_30, step2_31; 27 int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; 28 int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; 29 int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; 30 int step3_29, step3_30, step3_31; 31 int temp0, temp1, temp2, temp3; 32 int load1, load2, load3, load4; 33 int result1, result2; 34 int i; 35 uint8_t *dest_pix, *dest_pix1; 36 const int const_2_power_13 = 8192; 37 uint8_t *cm = vpx_ff_cropTbl; 38 39 /* prefetch vpx_ff_cropTbl */ 40 prefetch_load(vpx_ff_cropTbl); 41 prefetch_load(vpx_ff_cropTbl + 32); 42 prefetch_load(vpx_ff_cropTbl + 64); 43 prefetch_load(vpx_ff_cropTbl + 96); 44 prefetch_load(vpx_ff_cropTbl + 128); 45 prefetch_load(vpx_ff_cropTbl + 160); 46 prefetch_load(vpx_ff_cropTbl + 192); 47 prefetch_load(vpx_ff_cropTbl + 224); 48 49 for (i = 0; i < 32; ++i) { 50 dest_pix = dest + i; 51 dest_pix1 = dest + i + 31 * stride; 52 53 __asm__ __volatile__( 54 "lh %[load1], 2(%[input]) \n\t" 55 "lh %[load2], 62(%[input]) \n\t" 56 "lh %[load3], 34(%[input]) \n\t" 57 "lh %[load4], 30(%[input]) \n\t" 58 59 "mtlo %[const_2_power_13], $ac1 \n\t" 60 "mthi $zero, $ac1 \n\t" 61 "mtlo %[const_2_power_13], $ac3 \n\t" 62 "mthi $zero, $ac3 \n\t" 63 64 "madd $ac1, %[load1], %[cospi_31_64] \n\t" 65 "msub $ac1, %[load2], %[cospi_1_64] \n\t" 66 "extp %[temp0], $ac1, 31 \n\t" 67 68 "madd $ac3, %[load1], %[cospi_1_64] \n\t" 69 "madd $ac3, %[load2], %[cospi_31_64] \n\t" 70 "extp %[temp3], $ac3, 31 \n\t" 71 72 "mtlo %[const_2_power_13], $ac1 \n\t" 73 "mthi $zero, $ac1 \n\t" 74 "mtlo %[const_2_power_13], $ac2 \n\t" 75 "mthi $zero, $ac2 \n\t" 76 77 "madd $ac2, %[load3], %[cospi_15_64] \n\t" 78 "msub $ac2, %[load4], %[cospi_17_64] \n\t" 79 "extp %[temp1], $ac2, 31 \n\t" 80 81 "madd $ac1, %[load3], %[cospi_17_64] \n\t" 82 "madd $ac1, %[load4], %[cospi_15_64] \n\t" 83 "extp %[temp2], $ac1, 31 \n\t" 84 85 "mtlo %[const_2_power_13], $ac1 \n\t" 86 "mthi $zero, $ac1 \n\t" 87 "mtlo %[const_2_power_13], $ac3 \n\t" 88 "mthi $zero, $ac3 \n\t" 89 90 "sub %[load1], %[temp3], %[temp2] \n\t" 91 "sub %[load2], %[temp0], %[temp1] \n\t" 92 93 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 94 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 95 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 96 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 97 98 "extp %[step1_17], $ac1, 31 \n\t" 99 "extp %[step1_30], $ac3, 31 \n\t" 100 "add %[step1_16], %[temp0], %[temp1] \n\t" 101 "add %[step1_31], %[temp2], %[temp3] \n\t" 102 103 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 104 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 105 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 106 [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), 107 [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) 108 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 109 [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), 110 [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), 111 [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); 112 113 __asm__ __volatile__( 114 "lh %[load1], 18(%[input]) \n\t" 115 "lh %[load2], 46(%[input]) \n\t" 116 "lh %[load3], 50(%[input]) \n\t" 117 "lh %[load4], 14(%[input]) \n\t" 118 119 "mtlo %[const_2_power_13], $ac1 \n\t" 120 "mthi $zero, $ac1 \n\t" 121 "mtlo %[const_2_power_13], $ac3 \n\t" 122 "mthi $zero, $ac3 \n\t" 123 124 "madd $ac1, %[load1], %[cospi_23_64] \n\t" 125 "msub $ac1, %[load2], %[cospi_9_64] \n\t" 126 "extp %[temp0], $ac1, 31 \n\t" 127 128 "madd $ac3, %[load1], %[cospi_9_64] \n\t" 129 "madd $ac3, %[load2], %[cospi_23_64] \n\t" 130 "extp %[temp3], $ac3, 31 \n\t" 131 132 "mtlo %[const_2_power_13], $ac1 \n\t" 133 "mthi $zero, $ac1 \n\t" 134 "mtlo %[const_2_power_13], $ac2 \n\t" 135 "mthi $zero, $ac2 \n\t" 136 137 "madd $ac2, %[load3], %[cospi_7_64] \n\t" 138 "msub $ac2, %[load4], %[cospi_25_64] \n\t" 139 "extp %[temp1], $ac2, 31 \n\t" 140 141 "madd $ac1, %[load3], %[cospi_25_64] \n\t" 142 "madd $ac1, %[load4], %[cospi_7_64] \n\t" 143 "extp %[temp2], $ac1, 31 \n\t" 144 145 "mtlo %[const_2_power_13], $ac1 \n\t" 146 "mthi $zero, $ac1 \n\t" 147 "mtlo %[const_2_power_13], $ac3 \n\t" 148 "mthi $zero, $ac3 \n\t" 149 150 "sub %[load1], %[temp1], %[temp0] \n\t" 151 "sub %[load2], %[temp2], %[temp3] \n\t" 152 153 "msub $ac1, %[load1], %[cospi_28_64] \n\t" 154 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 155 "msub $ac3, %[load1], %[cospi_4_64] \n\t" 156 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 157 158 "extp %[step1_18], $ac1, 31 \n\t" 159 "extp %[step1_29], $ac3, 31 \n\t" 160 "add %[step1_19], %[temp0], %[temp1] \n\t" 161 "add %[step1_28], %[temp2], %[temp3] \n\t" 162 163 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 164 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 165 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 166 [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), 167 [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) 168 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 169 [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), 170 [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), 171 [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); 172 173 __asm__ __volatile__( 174 "lh %[load1], 10(%[input]) \n\t" 175 "lh %[load2], 54(%[input]) \n\t" 176 "lh %[load3], 42(%[input]) \n\t" 177 "lh %[load4], 22(%[input]) \n\t" 178 179 "mtlo %[const_2_power_13], $ac1 \n\t" 180 "mthi $zero, $ac1 \n\t" 181 "mtlo %[const_2_power_13], $ac3 \n\t" 182 "mthi $zero, $ac3 \n\t" 183 184 "madd $ac1, %[load1], %[cospi_27_64] \n\t" 185 "msub $ac1, %[load2], %[cospi_5_64] \n\t" 186 "extp %[temp0], $ac1, 31 \n\t" 187 188 "madd $ac3, %[load1], %[cospi_5_64] \n\t" 189 "madd $ac3, %[load2], %[cospi_27_64] \n\t" 190 "extp %[temp3], $ac3, 31 \n\t" 191 192 "mtlo %[const_2_power_13], $ac1 \n\t" 193 "mthi $zero, $ac1 \n\t" 194 "mtlo %[const_2_power_13], $ac2 \n\t" 195 "mthi $zero, $ac2 \n\t" 196 197 "madd $ac2, %[load3], %[cospi_11_64] \n\t" 198 "msub $ac2, %[load4], %[cospi_21_64] \n\t" 199 "extp %[temp1], $ac2, 31 \n\t" 200 201 "madd $ac1, %[load3], %[cospi_21_64] \n\t" 202 "madd $ac1, %[load4], %[cospi_11_64] \n\t" 203 "extp %[temp2], $ac1, 31 \n\t" 204 205 "mtlo %[const_2_power_13], $ac1 \n\t" 206 "mthi $zero, $ac1 \n\t" 207 "mtlo %[const_2_power_13], $ac3 \n\t" 208 "mthi $zero, $ac3 \n\t" 209 210 "sub %[load1], %[temp0], %[temp1] \n\t" 211 "sub %[load2], %[temp3], %[temp2] \n\t" 212 213 "madd $ac1, %[load2], %[cospi_12_64] \n\t" 214 "msub $ac1, %[load1], %[cospi_20_64] \n\t" 215 "madd $ac3, %[load1], %[cospi_12_64] \n\t" 216 "madd $ac3, %[load2], %[cospi_20_64] \n\t" 217 218 "extp %[step1_21], $ac1, 31 \n\t" 219 "extp %[step1_26], $ac3, 31 \n\t" 220 "add %[step1_20], %[temp0], %[temp1] \n\t" 221 "add %[step1_27], %[temp2], %[temp3] \n\t" 222 223 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 224 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 225 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 226 [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), 227 [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) 228 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 229 [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), 230 [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), 231 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); 232 233 __asm__ __volatile__( 234 "lh %[load1], 26(%[input]) \n\t" 235 "lh %[load2], 38(%[input]) \n\t" 236 "lh %[load3], 58(%[input]) \n\t" 237 "lh %[load4], 6(%[input]) \n\t" 238 239 "mtlo %[const_2_power_13], $ac1 \n\t" 240 "mthi $zero, $ac1 \n\t" 241 "mtlo %[const_2_power_13], $ac3 \n\t" 242 "mthi $zero, $ac3 \n\t" 243 244 "madd $ac1, %[load1], %[cospi_19_64] \n\t" 245 "msub $ac1, %[load2], %[cospi_13_64] \n\t" 246 "extp %[temp0], $ac1, 31 \n\t" 247 "madd $ac3, %[load1], %[cospi_13_64] \n\t" 248 "madd $ac3, %[load2], %[cospi_19_64] \n\t" 249 "extp %[temp3], $ac3, 31 \n\t" 250 251 "mtlo %[const_2_power_13], $ac1 \n\t" 252 "mthi $zero, $ac1 \n\t" 253 "mtlo %[const_2_power_13], $ac2 \n\t" 254 "mthi $zero, $ac2 \n\t" 255 256 "madd $ac2, %[load3], %[cospi_3_64] \n\t" 257 "msub $ac2, %[load4], %[cospi_29_64] \n\t" 258 "extp %[temp1], $ac2, 31 \n\t" 259 "madd $ac1, %[load3], %[cospi_29_64] \n\t" 260 "madd $ac1, %[load4], %[cospi_3_64] \n\t" 261 "extp %[temp2], $ac1, 31 \n\t" 262 263 "mtlo %[const_2_power_13], $ac1 \n\t" 264 "mthi $zero, $ac1 \n\t" 265 "mtlo %[const_2_power_13], $ac3 \n\t" 266 "mthi $zero, $ac3 \n\t" 267 268 "sub %[load1], %[temp1], %[temp0] \n\t" 269 "sub %[load2], %[temp2], %[temp3] \n\t" 270 "msub $ac1, %[load1], %[cospi_12_64] \n\t" 271 "msub $ac1, %[load2], %[cospi_20_64] \n\t" 272 "msub $ac3, %[load1], %[cospi_20_64] \n\t" 273 "madd $ac3, %[load2], %[cospi_12_64] \n\t" 274 "extp %[step1_22], $ac1, 31 \n\t" 275 "extp %[step1_25], $ac3, 31 \n\t" 276 "add %[step1_23], %[temp0], %[temp1] \n\t" 277 "add %[step1_24], %[temp2], %[temp3] \n\t" 278 279 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 280 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 281 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 282 [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), 283 [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) 284 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 285 [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), 286 [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), 287 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); 288 289 __asm__ __volatile__( 290 "lh %[load1], 4(%[input]) \n\t" 291 "lh %[load2], 60(%[input]) \n\t" 292 "lh %[load3], 36(%[input]) \n\t" 293 "lh %[load4], 28(%[input]) \n\t" 294 295 "mtlo %[const_2_power_13], $ac1 \n\t" 296 "mthi $zero, $ac1 \n\t" 297 "mtlo %[const_2_power_13], $ac3 \n\t" 298 "mthi $zero, $ac3 \n\t" 299 300 "madd $ac1, %[load1], %[cospi_30_64] \n\t" 301 "msub $ac1, %[load2], %[cospi_2_64] \n\t" 302 "extp %[temp0], $ac1, 31 \n\t" 303 "madd $ac3, %[load1], %[cospi_2_64] \n\t" 304 "madd $ac3, %[load2], %[cospi_30_64] \n\t" 305 "extp %[temp3], $ac3, 31 \n\t" 306 307 "mtlo %[const_2_power_13], $ac1 \n\t" 308 "mthi $zero, $ac1 \n\t" 309 "mtlo %[const_2_power_13], $ac2 \n\t" 310 "mthi $zero, $ac2 \n\t" 311 312 "madd $ac2, %[load3], %[cospi_14_64] \n\t" 313 "msub $ac2, %[load4], %[cospi_18_64] \n\t" 314 "extp %[temp1], $ac2, 31 \n\t" 315 "madd $ac1, %[load3], %[cospi_18_64] \n\t" 316 "madd $ac1, %[load4], %[cospi_14_64] \n\t" 317 "extp %[temp2], $ac1, 31 \n\t" 318 319 "mtlo %[const_2_power_13], $ac1 \n\t" 320 "mthi $zero, $ac1 \n\t" 321 "mtlo %[const_2_power_13], $ac3 \n\t" 322 "mthi $zero, $ac3 \n\t" 323 324 "sub %[load1], %[temp0], %[temp1] \n\t" 325 "sub %[load2], %[temp3], %[temp2] \n\t" 326 "msub $ac1, %[load1], %[cospi_8_64] \n\t" 327 "madd $ac1, %[load2], %[cospi_24_64] \n\t" 328 "madd $ac3, %[load1], %[cospi_24_64] \n\t" 329 "madd $ac3, %[load2], %[cospi_8_64] \n\t" 330 "extp %[step2_9], $ac1, 31 \n\t" 331 "extp %[step2_14], $ac3, 31 \n\t" 332 "add %[step2_8], %[temp0], %[temp1] \n\t" 333 "add %[step2_15], %[temp2], %[temp3] \n\t" 334 335 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 336 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 337 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), 338 [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), 339 [step2_15] "=&r"(step2_15) 340 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 341 [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), 342 [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), 343 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); 344 345 __asm__ __volatile__( 346 "lh %[load1], 20(%[input]) \n\t" 347 "lh %[load2], 44(%[input]) \n\t" 348 "lh %[load3], 52(%[input]) \n\t" 349 "lh %[load4], 12(%[input]) \n\t" 350 351 "mtlo %[const_2_power_13], $ac1 \n\t" 352 "mthi $zero, $ac1 \n\t" 353 "mtlo %[const_2_power_13], $ac3 \n\t" 354 "mthi $zero, $ac3 \n\t" 355 356 "madd $ac1, %[load1], %[cospi_22_64] \n\t" 357 "msub $ac1, %[load2], %[cospi_10_64] \n\t" 358 "extp %[temp0], $ac1, 31 \n\t" 359 "madd $ac3, %[load1], %[cospi_10_64] \n\t" 360 "madd $ac3, %[load2], %[cospi_22_64] \n\t" 361 "extp %[temp3], $ac3, 31 \n\t" 362 363 "mtlo %[const_2_power_13], $ac1 \n\t" 364 "mthi $zero, $ac1 \n\t" 365 "mtlo %[const_2_power_13], $ac2 \n\t" 366 "mthi $zero, $ac2 \n\t" 367 368 "madd $ac2, %[load3], %[cospi_6_64] \n\t" 369 "msub $ac2, %[load4], %[cospi_26_64] \n\t" 370 "extp %[temp1], $ac2, 31 \n\t" 371 "madd $ac1, %[load3], %[cospi_26_64] \n\t" 372 "madd $ac1, %[load4], %[cospi_6_64] \n\t" 373 "extp %[temp2], $ac1, 31 \n\t" 374 375 "mtlo %[const_2_power_13], $ac1 \n\t" 376 "mthi $zero, $ac1 \n\t" 377 "mtlo %[const_2_power_13], $ac3 \n\t" 378 "mthi $zero, $ac3 \n\t" 379 380 "sub %[load1], %[temp1], %[temp0] \n\t" 381 "sub %[load2], %[temp2], %[temp3] \n\t" 382 "msub $ac1, %[load1], %[cospi_24_64] \n\t" 383 "msub $ac1, %[load2], %[cospi_8_64] \n\t" 384 "madd $ac3, %[load2], %[cospi_24_64] \n\t" 385 "msub $ac3, %[load1], %[cospi_8_64] \n\t" 386 "extp %[step2_10], $ac1, 31 \n\t" 387 "extp %[step2_13], $ac3, 31 \n\t" 388 "add %[step2_11], %[temp0], %[temp1] \n\t" 389 "add %[step2_12], %[temp2], %[temp3] \n\t" 390 391 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 392 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 393 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 394 [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), 395 [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) 396 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 397 [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), 398 [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), 399 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); 400 401 __asm__ __volatile__( 402 "mtlo %[const_2_power_13], $ac0 \n\t" 403 "mthi $zero, $ac0 \n\t" 404 "sub %[temp0], %[step2_14], %[step2_13] \n\t" 405 "sub %[temp0], %[temp0], %[step2_9] \n\t" 406 "add %[temp0], %[temp0], %[step2_10] \n\t" 407 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 408 "mtlo %[const_2_power_13], $ac1 \n\t" 409 "mthi $zero, $ac1 \n\t" 410 "sub %[temp1], %[step2_14], %[step2_13] \n\t" 411 "add %[temp1], %[temp1], %[step2_9] \n\t" 412 "sub %[temp1], %[temp1], %[step2_10] \n\t" 413 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 414 "mtlo %[const_2_power_13], $ac2 \n\t" 415 "mthi $zero, $ac2 \n\t" 416 "sub %[temp0], %[step2_15], %[step2_12] \n\t" 417 "sub %[temp0], %[temp0], %[step2_8] \n\t" 418 "add %[temp0], %[temp0], %[step2_11] \n\t" 419 "madd $ac2, %[temp0], %[cospi_16_64] \n\t" 420 "mtlo %[const_2_power_13], $ac3 \n\t" 421 "mthi $zero, $ac3 \n\t" 422 "sub %[temp1], %[step2_15], %[step2_12] \n\t" 423 "add %[temp1], %[temp1], %[step2_8] \n\t" 424 "sub %[temp1], %[temp1], %[step2_11] \n\t" 425 "madd $ac3, %[temp1], %[cospi_16_64] \n\t" 426 427 "add %[step3_8], %[step2_8], %[step2_11] \n\t" 428 "add %[step3_9], %[step2_9], %[step2_10] \n\t" 429 "add %[step3_14], %[step2_13], %[step2_14] \n\t" 430 "add %[step3_15], %[step2_12], %[step2_15] \n\t" 431 "extp %[step3_10], $ac0, 31 \n\t" 432 "extp %[step3_13], $ac1, 31 \n\t" 433 "extp %[step3_11], $ac2, 31 \n\t" 434 "extp %[step3_12], $ac3, 31 \n\t" 435 436 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), 437 [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), 438 [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), 439 [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), 440 [step3_15] "=&r"(step3_15) 441 : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), 442 [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), 443 [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), 444 [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), 445 [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); 446 447 __asm__ __volatile__( 448 "mtlo %[const_2_power_13], $ac0 \n\t" 449 "mthi $zero, $ac0 \n\t" 450 "mtlo %[const_2_power_13], $ac1 \n\t" 451 "mthi $zero, $ac1 \n\t" 452 "sub %[temp0], %[step1_17], %[step1_18] \n\t" 453 "sub %[temp1], %[step1_30], %[step1_29] \n\t" 454 "add %[step3_17], %[step1_17], %[step1_18] \n\t" 455 "add %[step3_30], %[step1_30], %[step1_29] \n\t" 456 457 "msub $ac0, %[temp0], %[cospi_8_64] \n\t" 458 "madd $ac0, %[temp1], %[cospi_24_64] \n\t" 459 "extp %[step3_18], $ac0, 31 \n\t" 460 "madd $ac1, %[temp0], %[cospi_24_64] \n\t" 461 "madd $ac1, %[temp1], %[cospi_8_64] \n\t" 462 "extp %[step3_29], $ac1, 31 \n\t" 463 464 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 465 [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), 466 [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) 467 : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), 468 [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), 469 [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), 470 [cospi_8_64] "r"(cospi_8_64)); 471 472 __asm__ __volatile__( 473 "mtlo %[const_2_power_13], $ac0 \n\t" 474 "mthi $zero, $ac0 \n\t" 475 "mtlo %[const_2_power_13], $ac1 \n\t" 476 "mthi $zero, $ac1 \n\t" 477 "sub %[temp0], %[step1_16], %[step1_19] \n\t" 478 "sub %[temp1], %[step1_31], %[step1_28] \n\t" 479 "add %[step3_16], %[step1_16], %[step1_19] \n\t" 480 "add %[step3_31], %[step1_31], %[step1_28] \n\t" 481 482 "msub $ac0, %[temp0], %[cospi_8_64] \n\t" 483 "madd $ac0, %[temp1], %[cospi_24_64] \n\t" 484 "extp %[step3_19], $ac0, 31 \n\t" 485 "madd $ac1, %[temp0], %[cospi_24_64] \n\t" 486 "madd $ac1, %[temp1], %[cospi_8_64] \n\t" 487 "extp %[step3_28], $ac1, 31 \n\t" 488 489 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 490 [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), 491 [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) 492 : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), 493 [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), 494 [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), 495 [cospi_8_64] "r"(cospi_8_64)); 496 497 __asm__ __volatile__( 498 "mtlo %[const_2_power_13], $ac0 \n\t" 499 "mthi $zero, $ac0 \n\t" 500 "mtlo %[const_2_power_13], $ac1 \n\t" 501 "mthi $zero, $ac1 \n\t" 502 "sub %[temp0], %[step1_23], %[step1_20] \n\t" 503 "sub %[temp1], %[step1_24], %[step1_27] \n\t" 504 "add %[step3_23], %[step1_23], %[step1_20] \n\t" 505 "add %[step3_24], %[step1_24], %[step1_27] \n\t" 506 507 "msub $ac0, %[temp0], %[cospi_8_64] \n\t" 508 "madd $ac0, %[temp1], %[cospi_24_64] \n\t" 509 "extp %[step3_27], $ac0, 31 \n\t" 510 "msub $ac1, %[temp0], %[cospi_24_64] \n\t" 511 "msub $ac1, %[temp1], %[cospi_8_64] \n\t" 512 "extp %[step3_20], $ac1, 31 \n\t" 513 514 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 515 [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), 516 [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) 517 : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), 518 [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), 519 [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), 520 [cospi_8_64] "r"(cospi_8_64)); 521 522 __asm__ __volatile__( 523 "mtlo %[const_2_power_13], $ac0 \n\t" 524 "mthi $zero, $ac0 \n\t" 525 "mtlo %[const_2_power_13], $ac1 \n\t" 526 "mthi $zero, $ac1 \n\t" 527 "sub %[temp0], %[step1_22], %[step1_21] \n\t" 528 "sub %[temp1], %[step1_25], %[step1_26] \n\t" 529 "add %[step3_22], %[step1_22], %[step1_21] \n\t" 530 "add %[step3_25], %[step1_25], %[step1_26] \n\t" 531 532 "msub $ac0, %[temp0], %[cospi_24_64] \n\t" 533 "msub $ac0, %[temp1], %[cospi_8_64] \n\t" 534 "extp %[step3_21], $ac0, 31 \n\t" 535 "msub $ac1, %[temp0], %[cospi_8_64] \n\t" 536 "madd $ac1, %[temp1], %[cospi_24_64] \n\t" 537 "extp %[step3_26], $ac1, 31 \n\t" 538 539 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 540 [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), 541 [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) 542 : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), 543 [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), 544 [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), 545 [cospi_8_64] "r"(cospi_8_64)); 546 547 __asm__ __volatile__( 548 "add %[step2_16], %[step3_16], %[step3_23] \n\t" 549 "add %[step2_17], %[step3_17], %[step3_22] \n\t" 550 "add %[step2_18], %[step3_18], %[step3_21] \n\t" 551 "add %[step2_19], %[step3_19], %[step3_20] \n\t" 552 "sub %[step2_20], %[step3_19], %[step3_20] \n\t" 553 "sub %[step2_21], %[step3_18], %[step3_21] \n\t" 554 "sub %[step2_22], %[step3_17], %[step3_22] \n\t" 555 "sub %[step2_23], %[step3_16], %[step3_23] \n\t" 556 557 : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), 558 [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), 559 [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), 560 [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) 561 : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), 562 [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), 563 [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), 564 [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); 565 566 __asm__ __volatile__( 567 "sub %[step2_24], %[step3_31], %[step3_24] \n\t" 568 "sub %[step2_25], %[step3_30], %[step3_25] \n\t" 569 "sub %[step2_26], %[step3_29], %[step3_26] \n\t" 570 "sub %[step2_27], %[step3_28], %[step3_27] \n\t" 571 "add %[step2_28], %[step3_28], %[step3_27] \n\t" 572 "add %[step2_29], %[step3_29], %[step3_26] \n\t" 573 "add %[step2_30], %[step3_30], %[step3_25] \n\t" 574 "add %[step2_31], %[step3_31], %[step3_24] \n\t" 575 576 : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), 577 [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), 578 [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), 579 [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) 580 : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), 581 [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), 582 [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), 583 [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); 584 585 __asm__ __volatile__( 586 "lh %[load1], 0(%[input]) \n\t" 587 "lh %[load2], 32(%[input]) \n\t" 588 "lh %[load3], 16(%[input]) \n\t" 589 "lh %[load4], 48(%[input]) \n\t" 590 591 "mtlo %[const_2_power_13], $ac1 \n\t" 592 "mthi $zero, $ac1 \n\t" 593 "mtlo %[const_2_power_13], $ac2 \n\t" 594 "mthi $zero, $ac2 \n\t" 595 "add %[result1], %[load1], %[load2] \n\t" 596 "sub %[result2], %[load1], %[load2] \n\t" 597 "madd $ac1, %[result1], %[cospi_16_64] \n\t" 598 "madd $ac2, %[result2], %[cospi_16_64] \n\t" 599 "extp %[temp0], $ac1, 31 \n\t" 600 "extp %[temp1], $ac2, 31 \n\t" 601 602 "mtlo %[const_2_power_13], $ac3 \n\t" 603 "mthi $zero, $ac3 \n\t" 604 "madd $ac3, %[load3], %[cospi_24_64] \n\t" 605 "msub $ac3, %[load4], %[cospi_8_64] \n\t" 606 "extp %[temp2], $ac3, 31 \n\t" 607 "mtlo %[const_2_power_13], $ac1 \n\t" 608 "mthi $zero, $ac1 \n\t" 609 "madd $ac1, %[load3], %[cospi_8_64] \n\t" 610 "madd $ac1, %[load4], %[cospi_24_64] \n\t" 611 "extp %[temp3], $ac1, 31 \n\t" 612 "add %[step1_0], %[temp0], %[temp3] \n\t" 613 "add %[step1_1], %[temp1], %[temp2] \n\t" 614 "sub %[step1_2], %[temp1], %[temp2] \n\t" 615 "sub %[step1_3], %[temp0], %[temp3] \n\t" 616 617 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 618 [load4] "=&r"(load4), [result1] "=&r"(result1), 619 [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 620 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), 621 [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), 622 [step1_3] "=&r"(step1_3) 623 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 624 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), 625 [cospi_16_64] "r"(cospi_16_64)); 626 627 __asm__ __volatile__( 628 "lh %[load1], 8(%[input]) \n\t" 629 "lh %[load2], 56(%[input]) \n\t" 630 "lh %[load3], 40(%[input]) \n\t" 631 "lh %[load4], 24(%[input]) \n\t" 632 633 "mtlo %[const_2_power_13], $ac1 \n\t" 634 "mthi $zero, $ac1 \n\t" 635 "mtlo %[const_2_power_13], $ac3 \n\t" 636 "mthi $zero, $ac3 \n\t" 637 638 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 639 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 640 "extp %[temp0], $ac1, 31 \n\t" 641 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 642 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 643 "extp %[temp3], $ac3, 31 \n\t" 644 645 "mtlo %[const_2_power_13], $ac1 \n\t" 646 "mthi $zero, $ac1 \n\t" 647 "mtlo %[const_2_power_13], $ac2 \n\t" 648 "mthi $zero, $ac2 \n\t" 649 650 "madd $ac2, %[load3], %[cospi_12_64] \n\t" 651 "msub $ac2, %[load4], %[cospi_20_64] \n\t" 652 "extp %[temp1], $ac2, 31 \n\t" 653 "madd $ac1, %[load3], %[cospi_20_64] \n\t" 654 "madd $ac1, %[load4], %[cospi_12_64] \n\t" 655 "extp %[temp2], $ac1, 31 \n\t" 656 657 "mtlo %[const_2_power_13], $ac1 \n\t" 658 "mthi $zero, $ac1 \n\t" 659 "mtlo %[const_2_power_13], $ac3 \n\t" 660 "mthi $zero, $ac3 \n\t" 661 662 "sub %[load1], %[temp3], %[temp2] \n\t" 663 "sub %[load1], %[load1], %[temp0] \n\t" 664 "add %[load1], %[load1], %[temp1] \n\t" 665 "sub %[load2], %[temp0], %[temp1] \n\t" 666 "sub %[load2], %[load2], %[temp2] \n\t" 667 "add %[load2], %[load2], %[temp3] \n\t" 668 "madd $ac1, %[load1], %[cospi_16_64] \n\t" 669 "madd $ac3, %[load2], %[cospi_16_64] \n\t" 670 671 "extp %[step1_5], $ac1, 31 \n\t" 672 "extp %[step1_6], $ac3, 31 \n\t" 673 "add %[step1_4], %[temp0], %[temp1] \n\t" 674 "add %[step1_7], %[temp3], %[temp2] \n\t" 675 676 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 677 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 678 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), 679 [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), 680 [step1_7] "=&r"(step1_7) 681 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 682 [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), 683 [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), 684 [cospi_16_64] "r"(cospi_16_64)); 685 686 __asm__ __volatile__( 687 "add %[step2_0], %[step1_0], %[step1_7] \n\t" 688 "add %[step2_1], %[step1_1], %[step1_6] \n\t" 689 "add %[step2_2], %[step1_2], %[step1_5] \n\t" 690 "add %[step2_3], %[step1_3], %[step1_4] \n\t" 691 "sub %[step2_4], %[step1_3], %[step1_4] \n\t" 692 "sub %[step2_5], %[step1_2], %[step1_5] \n\t" 693 "sub %[step2_6], %[step1_1], %[step1_6] \n\t" 694 "sub %[step2_7], %[step1_0], %[step1_7] \n\t" 695 696 : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), 697 [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), 698 [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), 699 [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) 700 : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), 701 [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), 702 [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), 703 [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); 704 705 // stage 7 706 __asm__ __volatile__( 707 "add %[step1_0], %[step2_0], %[step3_15] \n\t" 708 "add %[step1_1], %[step2_1], %[step3_14] \n\t" 709 "add %[step1_2], %[step2_2], %[step3_13] \n\t" 710 "add %[step1_3], %[step2_3], %[step3_12] \n\t" 711 "sub %[step1_12], %[step2_3], %[step3_12] \n\t" 712 "sub %[step1_13], %[step2_2], %[step3_13] \n\t" 713 "sub %[step1_14], %[step2_1], %[step3_14] \n\t" 714 "sub %[step1_15], %[step2_0], %[step3_15] \n\t" 715 716 : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), 717 [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), 718 [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), 719 [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) 720 : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), 721 [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), 722 [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), 723 [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); 724 725 __asm__ __volatile__( 726 "add %[step1_4], %[step2_4], %[step3_11] \n\t" 727 "add %[step1_5], %[step2_5], %[step3_10] \n\t" 728 "add %[step1_6], %[step2_6], %[step3_9] \n\t" 729 "add %[step1_7], %[step2_7], %[step3_8] \n\t" 730 "sub %[step1_8], %[step2_7], %[step3_8] \n\t" 731 "sub %[step1_9], %[step2_6], %[step3_9] \n\t" 732 "sub %[step1_10], %[step2_5], %[step3_10] \n\t" 733 "sub %[step1_11], %[step2_4], %[step3_11] \n\t" 734 735 : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), 736 [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), 737 [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), 738 [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) 739 : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), 740 [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), 741 [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), 742 [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); 743 744 __asm__ __volatile__( 745 "sub %[temp0], %[step2_27], %[step2_20] \n\t" 746 "add %[temp1], %[step2_27], %[step2_20] \n\t" 747 "sub %[temp2], %[step2_26], %[step2_21] \n\t" 748 "add %[temp3], %[step2_26], %[step2_21] \n\t" 749 750 "mtlo %[const_2_power_13], $ac0 \n\t" 751 "mthi $zero, $ac0 \n\t" 752 "mtlo %[const_2_power_13], $ac1 \n\t" 753 "mthi $zero, $ac1 \n\t" 754 "mtlo %[const_2_power_13], $ac2 \n\t" 755 "mthi $zero, $ac2 \n\t" 756 "mtlo %[const_2_power_13], $ac3 \n\t" 757 "mthi $zero, $ac3 \n\t" 758 759 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 760 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 761 "madd $ac2, %[temp2], %[cospi_16_64] \n\t" 762 "madd $ac3, %[temp3], %[cospi_16_64] \n\t" 763 764 "extp %[step1_20], $ac0, 31 \n\t" 765 "extp %[step1_27], $ac1, 31 \n\t" 766 "extp %[step1_21], $ac2, 31 \n\t" 767 "extp %[step1_26], $ac3, 31 \n\t" 768 769 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 770 [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), 771 [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), 772 [step1_26] "=&r"(step1_26) 773 : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), 774 [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), 775 [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); 776 777 __asm__ __volatile__( 778 "sub %[temp0], %[step2_25], %[step2_22] \n\t" 779 "add %[temp1], %[step2_25], %[step2_22] \n\t" 780 "sub %[temp2], %[step2_24], %[step2_23] \n\t" 781 "add %[temp3], %[step2_24], %[step2_23] \n\t" 782 783 "mtlo %[const_2_power_13], $ac0 \n\t" 784 "mthi $zero, $ac0 \n\t" 785 "mtlo %[const_2_power_13], $ac1 \n\t" 786 "mthi $zero, $ac1 \n\t" 787 "mtlo %[const_2_power_13], $ac2 \n\t" 788 "mthi $zero, $ac2 \n\t" 789 "mtlo %[const_2_power_13], $ac3 \n\t" 790 "mthi $zero, $ac3 \n\t" 791 792 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 793 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 794 "madd $ac2, %[temp2], %[cospi_16_64] \n\t" 795 "madd $ac3, %[temp3], %[cospi_16_64] \n\t" 796 797 "extp %[step1_22], $ac0, 31 \n\t" 798 "extp %[step1_25], $ac1, 31 \n\t" 799 "extp %[step1_23], $ac2, 31 \n\t" 800 "extp %[step1_24], $ac3, 31 \n\t" 801 802 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 803 [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), 804 [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), 805 [step1_24] "=&r"(step1_24) 806 : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), 807 [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), 808 [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); 809 810 __asm__ __volatile__( 811 "lbu %[temp2], 0(%[dest_pix]) \n\t" 812 "add %[temp0], %[step1_0], %[step2_31] \n\t" 813 "addi %[temp0], %[temp0], 32 \n\t" 814 "sra %[temp0], %[temp0], 6 \n\t" 815 "add %[temp2], %[temp2], %[temp0] \n\t" 816 "lbux %[temp0], %[temp2](%[cm]) \n\t" 817 "add %[temp1], %[step1_1], %[step2_30] \n\t" 818 "sb %[temp0], 0(%[dest_pix]) \n\t" 819 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 820 "lbu %[temp3], 0(%[dest_pix]) \n\t" 821 "addi %[temp1], %[temp1], 32 \n\t" 822 "sra %[temp1], %[temp1], 6 \n\t" 823 "add %[temp3], %[temp3], %[temp1] \n\t" 824 "lbux %[temp1], %[temp3](%[cm]) \n\t" 825 "sb %[temp1], 0(%[dest_pix]) \n\t" 826 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 827 828 "lbu %[temp2], 0(%[dest_pix]) \n\t" 829 "add %[temp0], %[step1_2], %[step2_29] \n\t" 830 "addi %[temp0], %[temp0], 32 \n\t" 831 "sra %[temp0], %[temp0], 6 \n\t" 832 "add %[temp2], %[temp2], %[temp0] \n\t" 833 "lbux %[temp0], %[temp2](%[cm]) \n\t" 834 "add %[temp1], %[step1_3], %[step2_28] \n\t" 835 "sb %[temp0], 0(%[dest_pix]) \n\t" 836 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 837 "lbu %[temp3], 0(%[dest_pix]) \n\t" 838 "addi %[temp1], %[temp1], 32 \n\t" 839 "sra %[temp1], %[temp1], 6 \n\t" 840 "add %[temp3], %[temp3], %[temp1] \n\t" 841 "lbux %[temp1], %[temp3](%[cm]) \n\t" 842 "sb %[temp1], 0(%[dest_pix]) \n\t" 843 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 844 845 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 846 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) 847 : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), 848 [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), 849 [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), 850 [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), 851 [step2_31] "r"(step2_31)); 852 853 step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); 854 step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); 855 step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); 856 step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); 857 858 __asm__ __volatile__( 859 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 860 "add %[temp2], %[temp2], %[step3_15] \n\t" 861 "lbux %[temp0], %[temp2](%[cm]) \n\t" 862 "sb %[temp0], 0(%[dest_pix1]) \n\t" 863 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 864 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 865 "add %[temp3], %[temp3], %[step3_14] \n\t" 866 "lbux %[temp1], %[temp3](%[cm]) \n\t" 867 "sb %[temp1], 0(%[dest_pix1]) \n\t" 868 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 869 870 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 871 "add %[temp2], %[temp2], %[step3_13] \n\t" 872 "lbux %[temp0], %[temp2](%[cm]) \n\t" 873 "sb %[temp0], 0(%[dest_pix1]) \n\t" 874 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 875 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 876 "add %[temp3], %[temp3], %[step3_12] \n\t" 877 "lbux %[temp1], %[temp3](%[cm]) \n\t" 878 "sb %[temp1], 0(%[dest_pix1]) \n\t" 879 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 880 881 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 882 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) 883 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), 884 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), 885 [step3_15] "r"(step3_15)); 886 887 __asm__ __volatile__( 888 "lbu %[temp2], 0(%[dest_pix]) \n\t" 889 "add %[temp0], %[step1_4], %[step1_27] \n\t" 890 "addi %[temp0], %[temp0], 32 \n\t" 891 "sra %[temp0], %[temp0], 6 \n\t" 892 "add %[temp2], %[temp2], %[temp0] \n\t" 893 "lbux %[temp0], %[temp2](%[cm]) \n\t" 894 "add %[temp1], %[step1_5], %[step1_26] \n\t" 895 "sb %[temp0], 0(%[dest_pix]) \n\t" 896 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 897 "lbu %[temp3], 0(%[dest_pix]) \n\t" 898 "addi %[temp1], %[temp1], 32 \n\t" 899 "sra %[temp1], %[temp1], 6 \n\t" 900 "add %[temp3], %[temp3], %[temp1] \n\t" 901 "lbux %[temp1], %[temp3](%[cm]) \n\t" 902 "sb %[temp1], 0(%[dest_pix]) \n\t" 903 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 904 905 "lbu %[temp2], 0(%[dest_pix]) \n\t" 906 "add %[temp0], %[step1_6], %[step1_25] \n\t" 907 "addi %[temp0], %[temp0], 32 \n\t" 908 "sra %[temp0], %[temp0], 6 \n\t" 909 "add %[temp2], %[temp2], %[temp0] \n\t" 910 "lbux %[temp0], %[temp2](%[cm]) \n\t" 911 "add %[temp1], %[step1_7], %[step1_24] \n\t" 912 "sb %[temp0], 0(%[dest_pix]) \n\t" 913 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 914 "lbu %[temp3], 0(%[dest_pix]) \n\t" 915 "addi %[temp1], %[temp1], 32 \n\t" 916 "sra %[temp1], %[temp1], 6 \n\t" 917 "add %[temp3], %[temp3], %[temp1] \n\t" 918 "lbux %[temp1], %[temp3](%[cm]) \n\t" 919 "sb %[temp1], 0(%[dest_pix]) \n\t" 920 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 921 922 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 923 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) 924 : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4), 925 [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), 926 [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), 927 [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), 928 [step1_27] "r"(step1_27)); 929 930 step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); 931 step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); 932 step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); 933 step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); 934 935 __asm__ __volatile__( 936 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 937 "add %[temp2], %[temp2], %[step3_15] \n\t" 938 "lbux %[temp0], %[temp2](%[cm]) \n\t" 939 "sb %[temp0], 0(%[dest_pix1]) \n\t" 940 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 941 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 942 "add %[temp3], %[temp3], %[step3_14] \n\t" 943 "lbux %[temp1], %[temp3](%[cm]) \n\t" 944 "sb %[temp1], 0(%[dest_pix1]) \n\t" 945 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 946 947 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 948 "add %[temp2], %[temp2], %[step3_13] \n\t" 949 "lbux %[temp0], %[temp2](%[cm]) \n\t" 950 "sb %[temp0], 0(%[dest_pix1]) \n\t" 951 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 952 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 953 "add %[temp3], %[temp3], %[step3_12] \n\t" 954 "lbux %[temp1], %[temp3](%[cm]) \n\t" 955 "sb %[temp1], 0(%[dest_pix1]) \n\t" 956 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 957 958 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 959 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) 960 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), 961 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), 962 [step3_15] "r"(step3_15)); 963 964 __asm__ __volatile__( 965 "lbu %[temp2], 0(%[dest_pix]) \n\t" 966 "add %[temp0], %[step1_8], %[step1_23] \n\t" 967 "addi %[temp0], %[temp0], 32 \n\t" 968 "sra %[temp0], %[temp0], 6 \n\t" 969 "add %[temp2], %[temp2], %[temp0] \n\t" 970 "lbux %[temp0], %[temp2](%[cm]) \n\t" 971 "add %[temp1], %[step1_9], %[step1_22] \n\t" 972 "sb %[temp0], 0(%[dest_pix]) \n\t" 973 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 974 "lbu %[temp3], 0(%[dest_pix]) \n\t" 975 "addi %[temp1], %[temp1], 32 \n\t" 976 "sra %[temp1], %[temp1], 6 \n\t" 977 "add %[temp3], %[temp3], %[temp1] \n\t" 978 "lbux %[temp1], %[temp3](%[cm]) \n\t" 979 "sb %[temp1], 0(%[dest_pix]) \n\t" 980 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 981 982 "lbu %[temp2], 0(%[dest_pix]) \n\t" 983 "add %[temp0], %[step1_10], %[step1_21] \n\t" 984 "addi %[temp0], %[temp0], 32 \n\t" 985 "sra %[temp0], %[temp0], 6 \n\t" 986 "add %[temp2], %[temp2], %[temp0] \n\t" 987 "lbux %[temp0], %[temp2](%[cm]) \n\t" 988 "add %[temp1], %[step1_11], %[step1_20] \n\t" 989 "sb %[temp0], 0(%[dest_pix]) \n\t" 990 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 991 "lbu %[temp3], 0(%[dest_pix]) \n\t" 992 "addi %[temp1], %[temp1], 32 \n\t" 993 "sra %[temp1], %[temp1], 6 \n\t" 994 "add %[temp3], %[temp3], %[temp1] \n\t" 995 "lbux %[temp1], %[temp3](%[cm]) \n\t" 996 "sb %[temp1], 0(%[dest_pix]) \n\t" 997 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 998 999 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 1000 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) 1001 : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8), 1002 [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), 1003 [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), 1004 [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), 1005 [step1_23] "r"(step1_23)); 1006 1007 step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); 1008 step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); 1009 step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); 1010 step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); 1011 1012 __asm__ __volatile__( 1013 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 1014 "add %[temp2], %[temp2], %[step3_15] \n\t" 1015 "lbux %[temp0], %[temp2](%[cm]) \n\t" 1016 "sb %[temp0], 0(%[dest_pix1]) \n\t" 1017 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 1018 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 1019 "add %[temp3], %[temp3], %[step3_14] \n\t" 1020 "lbux %[temp1], %[temp3](%[cm]) \n\t" 1021 "sb %[temp1], 0(%[dest_pix1]) \n\t" 1022 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 1023 1024 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 1025 "add %[temp2], %[temp2], %[step3_13] \n\t" 1026 "lbux %[temp0], %[temp2](%[cm]) \n\t" 1027 "sb %[temp0], 0(%[dest_pix1]) \n\t" 1028 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 1029 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 1030 "add %[temp3], %[temp3], %[step3_12] \n\t" 1031 "lbux %[temp1], %[temp3](%[cm]) \n\t" 1032 "sb %[temp1], 0(%[dest_pix1]) \n\t" 1033 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 1034 1035 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 1036 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) 1037 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), 1038 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), 1039 [step3_15] "r"(step3_15)); 1040 1041 __asm__ __volatile__( 1042 "lbu %[temp2], 0(%[dest_pix]) \n\t" 1043 "add %[temp0], %[step1_12], %[step2_19] \n\t" 1044 "addi %[temp0], %[temp0], 32 \n\t" 1045 "sra %[temp0], %[temp0], 6 \n\t" 1046 "add %[temp2], %[temp2], %[temp0] \n\t" 1047 "lbux %[temp0], %[temp2](%[cm]) \n\t" 1048 "add %[temp1], %[step1_13], %[step2_18] \n\t" 1049 "sb %[temp0], 0(%[dest_pix]) \n\t" 1050 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 1051 "lbu %[temp3], 0(%[dest_pix]) \n\t" 1052 "addi %[temp1], %[temp1], 32 \n\t" 1053 "sra %[temp1], %[temp1], 6 \n\t" 1054 "add %[temp3], %[temp3], %[temp1] \n\t" 1055 "lbux %[temp1], %[temp3](%[cm]) \n\t" 1056 "sb %[temp1], 0(%[dest_pix]) \n\t" 1057 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 1058 1059 "lbu %[temp2], 0(%[dest_pix]) \n\t" 1060 "add %[temp0], %[step1_14], %[step2_17] \n\t" 1061 "addi %[temp0], %[temp0], 32 \n\t" 1062 "sra %[temp0], %[temp0], 6 \n\t" 1063 "add %[temp2], %[temp2], %[temp0] \n\t" 1064 "lbux %[temp0], %[temp2](%[cm]) \n\t" 1065 "add %[temp1], %[step1_15], %[step2_16] \n\t" 1066 "sb %[temp0], 0(%[dest_pix]) \n\t" 1067 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 1068 "lbu %[temp3], 0(%[dest_pix]) \n\t" 1069 "addi %[temp1], %[temp1], 32 \n\t" 1070 "sra %[temp1], %[temp1], 6 \n\t" 1071 "add %[temp3], %[temp3], %[temp1] \n\t" 1072 "lbux %[temp1], %[temp3](%[cm]) \n\t" 1073 "sb %[temp1], 0(%[dest_pix]) \n\t" 1074 1075 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 1076 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) 1077 : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12), 1078 [step1_13] "r"(step1_13), [step1_14] "r"(step1_14), 1079 [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), 1080 [step2_17] "r"(step2_17), [step2_18] "r"(step2_18), 1081 [step2_19] "r"(step2_19)); 1082 1083 step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); 1084 step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); 1085 step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); 1086 step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); 1087 1088 __asm__ __volatile__( 1089 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 1090 "add %[temp2], %[temp2], %[step3_15] \n\t" 1091 "lbux %[temp0], %[temp2](%[cm]) \n\t" 1092 "sb %[temp0], 0(%[dest_pix1]) \n\t" 1093 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 1094 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 1095 "add %[temp3], %[temp3], %[step3_14] \n\t" 1096 "lbux %[temp1], %[temp3](%[cm]) \n\t" 1097 "sb %[temp1], 0(%[dest_pix1]) \n\t" 1098 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 1099 1100 "lbu %[temp2], 0(%[dest_pix1]) \n\t" 1101 "add %[temp2], %[temp2], %[step3_13] \n\t" 1102 "lbux %[temp0], %[temp2](%[cm]) \n\t" 1103 "sb %[temp0], 0(%[dest_pix1]) \n\t" 1104 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" 1105 "lbu %[temp3], 0(%[dest_pix1]) \n\t" 1106 "add %[temp3], %[temp3], %[step3_12] \n\t" 1107 "lbux %[temp1], %[temp3](%[cm]) \n\t" 1108 "sb %[temp1], 0(%[dest_pix1]) \n\t" 1109 1110 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 1111 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) 1112 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), 1113 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), 1114 [step3_15] "r"(step3_15)); 1115 1116 input += 32; 1117 } 1118} 1119#endif // #if HAVE_DSPR2 1120