/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
/* Horizontal (row) pass of the 4x4 inverse DCT, using MIPS DSPr2
 * multiply-accumulate instructions (madd/msub into $ac0/$ac1, rounded
 * extraction via extp).  The result is written TRANSPOSED: output
 * advances by one int16 per row while stores go to byte offsets
 * 0/8/16/24 (element indices 0/4/8/12), so the column pass reads
 * contiguous rows.
 *
 * Requires the DSP control register to have been set up with
 * "wrdsp pos, 1" (pos == 45) by the caller so that extp extracts the
 * correctly scaled value — see vp9_idct4x4_16_add_dspr2 below.
 */
static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
  int16_t step_0, step_1, step_2, step_3;
  int Temp0, Temp1, Temp2, Temp3;
  /* Rounding bias preloaded into each accumulator so that the final
   * extp performs dct_const_round_shift() of the accumulated products
   * (presumably 1 << (DCT_CONST_BITS - 1) — see vp9_idct.h). */
  const int const_2_power_13 = 8192;
  int i;

  for (i = 4; i--; ) {
    __asm__ __volatile__ (
      /*
        temp_1 = (input[0] + input[2]) * cospi_16_64;
        step_0 = dct_const_round_shift(temp_1);

        temp_2 = (input[0] - input[2]) * cospi_16_64;
        step_1 = dct_const_round_shift(temp_2);
      */
      "lh %[Temp0], 0(%[input]) \n\t"
      "lh %[Temp1], 4(%[input]) \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "add %[Temp2], %[Temp0], %[Temp1] \n\t"
      "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
      "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
      "lh %[Temp0], 2(%[input]) \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "extp %[step_0], $ac0, 31 \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"

      "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
      "extp %[step_1], $ac1, 31 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      /*
        temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
        step_2 = dct_const_round_shift(temp1);
      */
      "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
      "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
      "extp %[step_2], $ac0, 31 \n\t"

      /*
        temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
        step_3 = dct_const_round_shift(temp2);
      */
      "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
      "extp %[step_3], $ac1, 31 \n\t"

      /*
        Butterfly stage, stored transposed (int16 indices 0/4/8/12):
        output[0]  = step_0 + step_3;
        output[4]  = step_1 + step_2;
        output[8]  = step_1 - step_2;
        output[12] = step_0 - step_3;
      */
      "add %[Temp0], %[step_0], %[step_3] \n\t"
      "sh %[Temp0], 0(%[output]) \n\t"

      "add %[Temp1], %[step_1], %[step_2] \n\t"
      "sh %[Temp1], 8(%[output]) \n\t"

      "sub %[Temp2], %[step_1], %[step_2] \n\t"
      "sh %[Temp2], 16(%[output]) \n\t"

      "sub %[Temp3], %[step_0], %[step_3] \n\t"
      "sh %[Temp3], 24(%[output]) \n\t"

      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
        [output] "+r" (output)
      : [const_2_power_13] "r" (const_2_power_13),
        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
        [cospi_24_64] "r" (cospi_24_64),
        [input] "r" (input)
    );

    input += 4;
    output += 1;  /* transposed layout: next row starts one element over */
  }
}

/* Vertical (column) pass of the 4x4 inverse DCT combined with
 * reconstruction: for each of the 4 columns, compute the 1-D IDCT, round
 * the result (add 8, arithmetic shift right by 4, i.e.
 * ROUND_POWER_OF_TWO(x, 4)), add it to the prediction in dest, clamp to
 * [0, 255] via a vp9_ff_cropTbl lookup (lbux), and store the pixel.
 * Expects `input` in the transposed layout produced by
 * vp9_idct4_rows_dspr2, and the same wrdsp setup as the row pass.
 */
static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                            int dest_stride) {
  int16_t step_0, step_1, step_2, step_3;
  int Temp0, Temp1, Temp2, Temp3;
  /* Rounding bias for dct_const_round_shift via extp — see row pass. */
  const int const_2_power_13 = 8192;
  int i;
  uint8_t *dest_pix;
  uint8_t *cm = vp9_ff_cropTbl;  /* clamp table: cm[v] == clip_pixel(v) */

  /* prefetch vp9_ff_cropTbl */
  vp9_prefetch_load(vp9_ff_cropTbl);
  vp9_prefetch_load(vp9_ff_cropTbl + 32);
  vp9_prefetch_load(vp9_ff_cropTbl + 64);
  vp9_prefetch_load(vp9_ff_cropTbl + 96);
  vp9_prefetch_load(vp9_ff_cropTbl + 128);
  vp9_prefetch_load(vp9_ff_cropTbl + 160);
  vp9_prefetch_load(vp9_ff_cropTbl + 192);
  vp9_prefetch_load(vp9_ff_cropTbl + 224);

  for (i = 0; i < 4; ++i) {
    dest_pix = (dest + i);  /* walk down column i of the destination */

    __asm__ __volatile__ (
      /*
        temp_1 = (input[0] + input[2]) * cospi_16_64;
        step_0 = dct_const_round_shift(temp_1);

        temp_2 = (input[0] - input[2]) * cospi_16_64;
        step_1 = dct_const_round_shift(temp_2);
      */
      "lh %[Temp0], 0(%[input]) \n\t"
      "lh %[Temp1], 4(%[input]) \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "add %[Temp2], %[Temp0], %[Temp1] \n\t"
      "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
      "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
      "lh %[Temp0], 2(%[input]) \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "extp %[step_0], $ac0, 31 \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"

      "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
      "extp %[step_1], $ac1, 31 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      /*
        temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
        step_2 = dct_const_round_shift(temp1);
      */
      "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
      "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
      "extp %[step_2], $ac0, 31 \n\t"

      /*
        temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
        step_3 = dct_const_round_shift(temp2);
      */
      "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
      "extp %[step_3], $ac1, 31 \n\t"

      /*
        Butterfly + reconstruction: for each of the four column outputs
          step_0 + step_3, step_1 + step_2, step_1 - step_2, step_0 - step_3
        compute dest = clip_pixel(dest + ROUND_POWER_OF_TWO(out, 4)),
        stepping dest_pix down by dest_stride after each pixel.  The next
        butterfly sum is computed before the previous pixel's clamp/store
        completes — presumably to hide load latency (scheduling choice;
        behavior is order-independent here since the values use separate
        registers).
      */
      "add %[Temp0], %[step_0], %[step_3] \n\t"
      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "add %[Temp0], %[step_1], %[step_2] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step_1], %[step_2] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step_0], %[step_3] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"

      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
        [dest_pix] "+r" (dest_pix)
      : [const_2_power_13] "r" (const_2_power_13),
        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
        [cospi_24_64] "r" (cospi_24_64),
        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );

    input += 4;  /* next column (rows pass stored the data transposed) */
  }
}

/* Full 4x4 inverse DCT + add: row pass into a transposed scratch buffer,
 * then column pass that reconstructs directly into dest. */
void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  // Rows
  vp9_idct4_rows_dspr2(input, outptr);

  // Columns
  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

/* DC-only 4x4 inverse transform + add: computes the single reconstruction
 * value a1 from input[0], then adds (or subtracts, when a1 < 0) its
 * saturated per-byte replication to each 4-byte row of dest using the
 * DSPr2 quad-byte SIMD ops (addu_s.qb / subu_s.qb). */
void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  int a1, absa1;
  int r;
  int32_t out;
  int t2, vector_a1, vector_a;
  uint32_t pos = 45;
  int16_t input_dc = input[0];

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"

    :
    : [pos] "r" (pos)
  );

  /* a1 = ROUND_POWER_OF_TWO(round-shifted DC, 4) */
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
  __asm__ __volatile__ (
    "addi %[out], %[out], 8 \n\t"
    "sra %[a1], %[out], 4 \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "abs %[absa1], %[a1] \n\t"
      "replv.qb %[vector_a1], %[absa1] \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 4; r--;) {
      __asm__ __volatile__ (
        /* saturating per-byte subtract: clamps at 0 */
        "lw %[t2], 0(%[dest]) \n\t"
        "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
        "sw %[vector_a], 0(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "replv.qb %[vector_a1], %[a1] \n\t"
      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 4; r--;) {
      __asm__ __volatile__ (
        /* saturating per-byte add: clamps at 255 */
        "lw %[t2], 0(%[dest]) \n\t"
        "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
        "sw %[vector_a], 0(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}

/* 1-D 4-point inverse ADST, plain C (no DSPr2 assembly despite the
 * naming convention shared with the rest of this file). */
static void iadst4_dspr2(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  int x0, x1, x2, x3;

  x0 = input[0];
  x1 = input[1];
  x2 = input[2];
  x3 = input[3];

  /* all-zero input short-circuits to all-zero output */
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}

/* 4x4 inverse hybrid transform + add.  tx_type selects DCT or ADST per
 * direction; the DCT passes use the DSPr2 kernels above, the ADST passes
 * use the C iadst4_dspr2.  Note the row-pass DCT output is transposed
 * (see vp9_idct4_rows_dspr2), which is why the mixed cases shuffle data
 * between out[], temp_in[] and temp_out[] the way they do. */
void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride, int tx_type) {
  int i, j;
  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
  int16_t *outptr = out;
  int16_t temp_in[4 * 4], temp_out[4];
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  switch (tx_type) {
    case DCT_DCT:   // DCT in both horizontal and vertical
      vp9_idct4_rows_dspr2(input, outptr);
      vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
      break;
    case ADST_DCT:  // ADST in vertical, DCT in horizontal
      vp9_idct4_rows_dspr2(input, outptr);

      outptr = out;

      for (i = 0; i < 4; ++i) {
        /* row pass stored transposed, so each group of 4 is a column */
        iadst4_dspr2(outptr, temp_out);

        for (j = 0; j < 4; ++j)
          dest[j * dest_stride + i] =
                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                      + dest[j * dest_stride + i]);

        outptr += 4;
      }
      break;
    case DCT_ADST:  // DCT in vertical, ADST in horizontal
      for (i = 0; i < 4; ++i) {
        iadst4_dspr2(input, outptr);
        input += 4;
        outptr += 4;
      }

      /* transpose so the column kernel (which expects transposed input)
       * operates on the right data */
      for (i = 0; i < 4; ++i) {
        for (j = 0; j < 4; ++j) {
            temp_in[i * 4 + j] = out[j * 4 + i];
        }
      }
      vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
      break;
    case ADST_ADST:  // ADST in both directions
      for (i = 0; i < 4; ++i) {
        iadst4_dspr2(input, outptr);
        input += 4;
        outptr += 4;
      }

      for (i = 0; i < 4; ++i) {
        /* gather column i, transform, then round/add/clamp into dest */
        for (j = 0; j < 4; ++j)
          temp_in[j] = out[j * 4 + i];
        iadst4_dspr2(temp_in, temp_out);

        for (j = 0; j < 4; ++j)
          dest[j * dest_stride + i] =
                  clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                      + dest[j * dest_stride + i]);
      }
      break;
    default:
      printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n");
      break;
  }
}
#endif  // #if HAVE_DSPR2