/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
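/* Row pass of the 8x8 inverse DCT.
 *
 * Each iteration consumes one row of 8 coefficients and stores its results
 * with a stride of 8 int16_t elements while the output pointer advances by
 * one element per row, so the buffer ends up transposed and the column pass
 * can read it linearly.
 *
 * Rounding sketch (plain C, for reference only): every
 * "mtlo 8192 / madd / extp ..., 31" group below is used as the DSPr2
 * equivalent of
 *
 *   result = ROUND_POWER_OF_TWO(sum, DCT_CONST_BITS);  // (sum + 8192) >> 14
 *
 * i.e. dct_const_round_shift(), with the rounding bias pre-loaded into the
 * accumulator and the shift done by the extract (see the wrdsp setup in the
 * callers).
 */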
static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
                                uint32_t no_rows) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  const int const_2_power_13 = 8192;
  int Temp0, Temp1, Temp2, Temp3, Temp4;
  int i;

  for (i = no_rows; i--; ) {
    __asm__ __volatile__ (
      /*
        temp_1 = (input[0] + input[4]) * cospi_16_64;
        step2_0 = dct_const_round_shift(temp_1);

        temp_2 = (input[0] - input[4]) * cospi_16_64;
        step2_1 = dct_const_round_shift(temp_2);
      */
      "lh %[Temp0], 0(%[input]) \n\t"
      "lh %[Temp1], 8(%[input]) \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "add %[Temp2], %[Temp0], %[Temp1] \n\t"
      "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
      "extp %[Temp4], $ac0, 31 \n\t"

      "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
      "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "extp %[Temp2], $ac1, 31 \n\t"

      /*
        temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
        step2_2 = dct_const_round_shift(temp_1);
      */
      "lh %[Temp0], 4(%[input]) \n\t"
      "lh %[Temp1], 12(%[input]) \n\t"
      "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
      "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "extp %[Temp3], $ac0, 31 \n\t"

      /*
        step1_1 = step2_1 + step2_2;
        step1_2 = step2_1 - step2_2;
      */
      "add %[step1_1], %[Temp2], %[Temp3] \n\t"
      "sub %[step1_2], %[Temp2], %[Temp3] \n\t"

      /*
        temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
        step2_3 = dct_const_round_shift(temp_2);
      */
      "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
      "extp %[Temp1], $ac1, 31 \n\t"

      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"

      /*
        step1_0 = step2_0 + step2_3;
        step1_3 = step2_0 - step2_3;
      */
      "add %[step1_0], %[Temp4], %[Temp1] \n\t"
      "sub %[step1_3], %[Temp4], %[Temp1] \n\t"

      /*
        temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
        step1_4 = dct_const_round_shift(temp_1);
      */
      "lh %[Temp0], 2(%[input]) \n\t"
      "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "lh %[Temp1], 14(%[input]) \n\t"
      "lh %[Temp0], 2(%[input]) \n\t"
      "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
      "extp %[step1_4], $ac0, 31 \n\t"

      /*
        temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
        step1_7 = dct_const_round_shift(temp_2);
      */
      "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
      "extp %[step1_7], $ac1, 31 \n\t"

      /*
        temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
        step1_5 = dct_const_round_shift(temp_1);
      */
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "lh %[Temp0], 10(%[input]) \n\t"
      "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
      "extp %[step1_5], $ac0, 31 \n\t"

      /*
        temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
        step1_6 = dct_const_round_shift(temp_2);
      */
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "lh %[Temp0], 10(%[input]) \n\t"
      "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
      "extp %[step1_6], $ac1, 31 \n\t"

      /*
        temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
        temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
      */
      "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
      "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
      "add %[Temp0], %[Temp0], %[step1_5] \n\t"
      "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
      "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
      "add %[Temp1], %[Temp1], %[step1_7] \n\t"

      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"

      /*
        step1_4 = step1_4 + step1_5;
        step1_7 = step1_6 + step1_7;
      */
      "add %[step1_4], %[step1_4], %[step1_5] \n\t"
      "add %[step1_7], %[step1_7], %[step1_6] \n\t"

      "extp %[step1_5], $ac0, 31 \n\t"
      "extp %[step1_6], $ac1, 31 \n\t"

      "add %[Temp0], %[step1_0], %[step1_7] \n\t"
      "sh %[Temp0], 0(%[output]) \n\t"
      "add %[Temp1], %[step1_1], %[step1_6] \n\t"
      "sh %[Temp1], 16(%[output]) \n\t"
      "add %[Temp0], %[step1_2], %[step1_5] \n\t"
      "sh %[Temp0], 32(%[output]) \n\t"
      "add %[Temp1], %[step1_3], %[step1_4] \n\t"
      "sh %[Temp1], 48(%[output]) \n\t"

      "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
      "sh %[Temp0], 64(%[output]) \n\t"
      "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
      "sh %[Temp1], 80(%[output]) \n\t"
      "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
      "sh %[Temp0], 96(%[output]) \n\t"
      "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
      "sh %[Temp1], 112(%[output]) \n\t"

      : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
        [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
        [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
        [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
        [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [Temp4] "=&r" (Temp4)
      : [const_2_power_13] "r" (const_2_power_13),
        [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
        [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
        [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
        [cospi_24_64] "r" (cospi_24_64),
        [output] "r" (output), [input] "r" (input)
    );

    input += 8;
    output += 1;
  }
}
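/* Column pass of the 8x8 inverse DCT, fused with the reconstruction add.
 *
 * Each of the 8 columns goes through the same butterfly as above; the result
 * is then rounded with ROUND_POWER_OF_TWO(x, 5) (the "addi 16 / sra 5"
 * pairs), added to the prediction already in dest, and clamped to [0, 255]
 * via the vp9_ff_cropTbl lookup ("lbux"), which takes the place of
 * clip_pixel().
 */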
static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                           int dest_stride) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int Temp0, Temp1, Temp2, Temp3;
  int i;
  const int const_2_power_13 = 8192;
  uint8_t *dest_pix;
  uint8_t *cm = vp9_ff_cropTbl;

  /* prefetch vp9_ff_cropTbl */
  vp9_prefetch_load(vp9_ff_cropTbl);
  vp9_prefetch_load(vp9_ff_cropTbl + 32);
  vp9_prefetch_load(vp9_ff_cropTbl + 64);
  vp9_prefetch_load(vp9_ff_cropTbl + 96);
  vp9_prefetch_load(vp9_ff_cropTbl + 128);
  vp9_prefetch_load(vp9_ff_cropTbl + 160);
  vp9_prefetch_load(vp9_ff_cropTbl + 192);
  vp9_prefetch_load(vp9_ff_cropTbl + 224);

  for (i = 0; i < 8; ++i) {
    dest_pix = (dest + i);

    __asm__ __volatile__ (
      /*
        temp_1 = (input[0] + input[4]) * cospi_16_64;
        step2_0 = dct_const_round_shift(temp_1);

        temp_2 = (input[0] - input[4]) * cospi_16_64;
        step2_1 = dct_const_round_shift(temp_2);
      */
      "lh %[Temp0], 0(%[input]) \n\t"
      "lh %[Temp1], 8(%[input]) \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "add %[Temp2], %[Temp0], %[Temp1] \n\t"
      "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
      "extp %[step1_6], $ac0, 31 \n\t"

      "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
      "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "extp %[Temp2], $ac1, 31 \n\t"

      /*
        temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
        step2_2 = dct_const_round_shift(temp_1);
      */
      "lh %[Temp0], 4(%[input]) \n\t"
      "lh %[Temp1], 12(%[input]) \n\t"
      "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
      "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "extp %[Temp3], $ac0, 31 \n\t"

      /*
        step1_1 = step2_1 + step2_2;
        step1_2 = step2_1 - step2_2;
      */
      "add %[step1_1], %[Temp2], %[Temp3] \n\t"
      "sub %[step1_2], %[Temp2], %[Temp3] \n\t"

      /*
        temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
        step2_3 = dct_const_round_shift(temp_2);
      */
      "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
      "extp %[Temp1], $ac1, 31 \n\t"

      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"

      /*
        step1_0 = step2_0 + step2_3;
        step1_3 = step2_0 - step2_3;
      */
      "add %[step1_0], %[step1_6], %[Temp1] \n\t"
      "sub %[step1_3], %[step1_6], %[Temp1] \n\t"

      /*
        temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
        step1_4 = dct_const_round_shift(temp_1);
      */
      "lh %[Temp0], 2(%[input]) \n\t"
      "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "lh %[Temp1], 14(%[input]) \n\t"
      "lh %[Temp0], 2(%[input]) \n\t"
      "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
      "extp %[step1_4], $ac0, 31 \n\t"

      /*
        temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
        step1_7 = dct_const_round_shift(temp_2);
      */
      "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
      "extp %[step1_7], $ac1, 31 \n\t"

      /*
        temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
        step1_5 = dct_const_round_shift(temp_1);
      */
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "lh %[Temp0], 10(%[input]) \n\t"
      "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
      "extp %[step1_5], $ac0, 31 \n\t"

      /*
        temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
        step1_6 = dct_const_round_shift(temp_2);
      */
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "lh %[Temp0], 10(%[input]) \n\t"
      "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
      "extp %[step1_6], $ac1, 31 \n\t"

      /*
        temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
        temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
      */
      "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
      "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
      "add %[Temp0], %[Temp0], %[step1_5] \n\t"
      "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
      "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
      "add %[Temp1], %[Temp1], %[step1_7] \n\t"

      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"

      /*
        step1_4 = step1_4 + step1_5;
        step1_7 = step1_6 + step1_7;
      */
      "add %[step1_4], %[step1_4], %[step1_5] \n\t"
      "add %[step1_7], %[step1_7], %[step1_6] \n\t"

      "extp %[step1_5], $ac0, 31 \n\t"
      "extp %[step1_6], $ac1, 31 \n\t"

      /* add block */
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp0], %[step1_0], %[step1_7] \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "add %[Temp0], %[step1_1], %[step1_6] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "add %[Temp0], %[step1_2], %[step1_5] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "add %[Temp0], %[step1_3], %[step1_4] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "addi %[Temp0], %[Temp0], 16 \n\t"
      "sra %[Temp0], %[Temp0], 5 \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"

      : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
        [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
        [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
        [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
        [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [dest_pix] "+r" (dest_pix)
      : [const_2_power_13] "r" (const_2_power_13),
        [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
        [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
        [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
        [cospi_24_64] "r" (cospi_24_64),
        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );

    input += 8;
  }
}
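/* Full 64-coefficient 8x8 IDCT plus reconstruction.
 *
 * "wrdsp %[pos], 1" with pos = 45 sets the DSPControl pos field once for
 * both passes; combined with the "extp ..., 31" instructions in the helpers
 * this appears to extract the 32 accumulator bits whose least-significant
 * bit is bit 14, i.e. the (acc + 8192) >> DCT_CONST_BITS rounding described
 * above.
 */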
void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_1d_rows_dspr2(input, outptr, 8);

  // Then transform columns and add to dest
  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}
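/* 8-point inverse ADST, kept in plain C (no DSPr2 assembly).
 *
 * The permuted loads (x0 = input[7], x1 = input[0], ...) and the negated
 * stores at the end implement the ADST input/output reordering; the
 * arithmetic itself follows the usual stage 1/2/3 butterflies of the
 * generic C implementation.
 */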
static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  int x0, x1, x2, x3, x4, x5, x6, x7;

  x0 = input[7];
  x1 = input[0];
  x2 = input[5];
  x3 = input[2];
  x4 = input[3];
  x5 = input[4];
  x6 = input[1];
  x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);

  output[0] = x0;
  output[1] = -x4;
  output[2] = x6;
  output[3] = -x2;
  output[4] = x3;
  output[5] = -x7;
  output[6] = x5;
  output[7] = -x1;
}
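/* 8x8 hybrid transform: tx_type selects DCT or ADST per direction.
 *
 * DCT passes reuse the DSPr2 helpers above; ADST passes fall back to the
 * plain-C iadst8_1d_dspr2(), transposing through temp_in where needed and
 * writing ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[] back with
 * clip_pixel().
 */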
void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride, int tx_type) {
  int i, j;
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  int16_t temp_in[8 * 8], temp_out[8];
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  switch (tx_type) {
    case DCT_DCT:    // DCT in both horizontal and vertical
      idct8_1d_rows_dspr2(input, outptr, 8);
      idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
      break;
    case ADST_DCT:   // ADST in vertical, DCT in horizontal
      idct8_1d_rows_dspr2(input, outptr, 8);

      for (i = 0; i < 8; ++i) {
        iadst8_1d_dspr2(&out[i * 8], temp_out);

        for (j = 0; j < 8; ++j)
          dest[j * dest_stride + i] =
              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                         + dest[j * dest_stride + i]);
      }
      break;
    case DCT_ADST:   // DCT in vertical, ADST in horizontal
      for (i = 0; i < 8; ++i) {
        iadst8_1d_dspr2(input, outptr);
        input += 8;
        outptr += 8;
      }

      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j) {
          temp_in[i * 8 + j] = out[j * 8 + i];
        }
      }
      idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
      break;
    case ADST_ADST:  // ADST in both directions
      for (i = 0; i < 8; ++i) {
        iadst8_1d_dspr2(input, outptr);
        input += 8;
        outptr += 8;
      }

      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j)
          temp_in[j] = out[j * 8 + i];

        iadst8_1d_dspr2(temp_in, temp_out);

        for (j = 0; j < 8; ++j)
          dest[j * dest_stride + i] =
              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                         + dest[j * dest_stride + i]);
      }
      break;
    default:
      printf("vp9_iht8x8_64_add_dspr2 : Invalid tx_type\n");
      break;
  }
}

void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_1d_rows_dspr2(input, outptr, 4);

  outptr += 4;

  __asm__ __volatile__ (
    "sw $zero, 0(%[outptr]) \n\t"
    "sw $zero, 4(%[outptr]) \n\t"
    "sw $zero, 16(%[outptr]) \n\t"
    "sw $zero, 20(%[outptr]) \n\t"
    "sw $zero, 32(%[outptr]) \n\t"
    "sw $zero, 36(%[outptr]) \n\t"
    "sw $zero, 48(%[outptr]) \n\t"
    "sw $zero, 52(%[outptr]) \n\t"
    "sw $zero, 64(%[outptr]) \n\t"
    "sw $zero, 68(%[outptr]) \n\t"
    "sw $zero, 80(%[outptr]) \n\t"
    "sw $zero, 84(%[outptr]) \n\t"
    "sw $zero, 96(%[outptr]) \n\t"
    "sw $zero, 100(%[outptr]) \n\t"
    "sw $zero, 112(%[outptr]) \n\t"
    "sw $zero, 116(%[outptr]) \n\t"

    :
    : [outptr] "r" (outptr)
  );

  // Then transform columns and add to dest
  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}
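/* DC-only 8x8 IDCT: only input[0] is nonzero.
 *
 * The DC coefficient is scaled by cospi_16_64 and rounded twice (the
 * DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64 macro), rounded again with
 * (out + 16) >> 5, replicated into all four bytes of a register with
 * "replv.qb", and then added to (or, for a negative DC, subtracted from)
 * eight destination pixels per row with the saturating "addu_s.qb" /
 * "subu_s.qb" byte operations, which also provide the clamping.
 */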
void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__ (
    "addi %[out], %[out], 16 \n\t"
    "sra %[a1], %[out], 5 \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "abs %[absa1], %[a1] \n\t"
      "replv.qb %[vector_a1], %[absa1] \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
        "lw %[t1], 0(%[dest]) \n\t"
        "lw %[t2], 4(%[dest]) \n\t"
        "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
        "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
        "sw %[vector_1], 0(%[dest]) \n\t"
        "sw %[vector_2], 4(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "replv.qb %[vector_a1], %[a1] \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
        "lw %[t1], 0(%[dest]) \n\t"
        "lw %[t2], 4(%[dest]) \n\t"
        "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
        "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
        "sw %[vector_1], 0(%[dest]) \n\t"
        "sw %[vector_2], 4(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [dest] "+r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
#endif  // #if HAVE_DSPR2