1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <math.h> 13 14#include "./vpx_config.h" 15#include "./vp9_rtcd.h" 16 17#include "vp9/common/vp9_blockd.h" 18#include "vp9/common/vp9_idct.h" 19#include "vp9/common/vp9_systemdependent.h" 20 21static INLINE tran_high_t fdct_round_shift(tran_high_t input) { 22 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 23 // TODO(debargha, peter.derivaz): Find new bounds for this assert 24 // and make the bounds consts. 25 // assert(INT16_MIN <= rv && rv <= INT16_MAX); 26 return rv; 27} 28 29static void fdct4(const tran_low_t *input, tran_low_t *output) { 30 tran_high_t step[4]; 31 tran_high_t temp1, temp2; 32 33 step[0] = input[0] + input[3]; 34 step[1] = input[1] + input[2]; 35 step[2] = input[1] - input[2]; 36 step[3] = input[0] - input[3]; 37 38 temp1 = (step[0] + step[1]) * cospi_16_64; 39 temp2 = (step[0] - step[1]) * cospi_16_64; 40 output[0] = fdct_round_shift(temp1); 41 output[2] = fdct_round_shift(temp2); 42 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 43 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 44 output[1] = fdct_round_shift(temp1); 45 output[3] = fdct_round_shift(temp2); 46} 47 48void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { 49 int r, c; 50 tran_low_t sum = 0; 51 for (r = 0; r < 4; ++r) 52 for (c = 0; c < 4; ++c) 53 sum += input[r * stride + c]; 54 55 output[0] = sum << 1; 56 output[1] = 0; 57} 58 59void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { 60 // The 2D transform is done with two passes which are actually pretty 61 // similar. In the first one, we transform the columns and transpose 62 // the results. In the second one, we transform the rows. To achieve that, 63 // as the first pass results are transposed, we transpose the columns (that 64 // is the transposed rows) and transpose the results (so that it goes back 65 // in normal/row positions). 66 int pass; 67 // We need an intermediate buffer between passes. 68 tran_low_t intermediate[4 * 4]; 69 const int16_t *in_pass0 = input; 70 const tran_low_t *in = NULL; 71 tran_low_t *out = intermediate; 72 // Do the two transform/transpose passes 73 for (pass = 0; pass < 2; ++pass) { 74 tran_high_t input[4]; // canbe16 75 tran_high_t step[4]; // canbe16 76 tran_high_t temp1, temp2; // needs32 77 int i; 78 for (i = 0; i < 4; ++i) { 79 // Load inputs. 80 if (0 == pass) { 81 input[0] = in_pass0[0 * stride] * 16; 82 input[1] = in_pass0[1 * stride] * 16; 83 input[2] = in_pass0[2 * stride] * 16; 84 input[3] = in_pass0[3 * stride] * 16; 85 if (i == 0 && input[0]) { 86 input[0] += 1; 87 } 88 } else { 89 input[0] = in[0 * 4]; 90 input[1] = in[1 * 4]; 91 input[2] = in[2 * 4]; 92 input[3] = in[3 * 4]; 93 } 94 // Transform. 95 step[0] = input[0] + input[3]; 96 step[1] = input[1] + input[2]; 97 step[2] = input[1] - input[2]; 98 step[3] = input[0] - input[3]; 99 temp1 = (step[0] + step[1]) * cospi_16_64; 100 temp2 = (step[0] - step[1]) * cospi_16_64; 101 out[0] = fdct_round_shift(temp1); 102 out[2] = fdct_round_shift(temp2); 103 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 104 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 105 out[1] = fdct_round_shift(temp1); 106 out[3] = fdct_round_shift(temp2); 107 // Do next column (which is a transposed row in second/horizontal pass) 108 in_pass0++; 109 in++; 110 out += 4; 111 } 112 // Setup in/out for next pass. 113 in = intermediate; 114 out = output; 115 } 116 117 { 118 int i, j; 119 for (i = 0; i < 4; ++i) { 120 for (j = 0; j < 4; ++j) 121 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; 122 } 123 } 124} 125 126static void fadst4(const tran_low_t *input, tran_low_t *output) { 127 tran_high_t x0, x1, x2, x3; 128 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 129 130 x0 = input[0]; 131 x1 = input[1]; 132 x2 = input[2]; 133 x3 = input[3]; 134 135 if (!(x0 | x1 | x2 | x3)) { 136 output[0] = output[1] = output[2] = output[3] = 0; 137 return; 138 } 139 140 s0 = sinpi_1_9 * x0; 141 s1 = sinpi_4_9 * x0; 142 s2 = sinpi_2_9 * x1; 143 s3 = sinpi_1_9 * x1; 144 s4 = sinpi_3_9 * x2; 145 s5 = sinpi_4_9 * x3; 146 s6 = sinpi_2_9 * x3; 147 s7 = x0 + x1 - x3; 148 149 x0 = s0 + s2 + s5; 150 x1 = sinpi_3_9 * s7; 151 x2 = s1 - s3 + s6; 152 x3 = s4; 153 154 s0 = x0 + x3; 155 s1 = x1; 156 s2 = x2 - x3; 157 s3 = x2 - x0 + x3; 158 159 // 1-D transform scaling factor is sqrt(2). 160 output[0] = fdct_round_shift(s0); 161 output[1] = fdct_round_shift(s1); 162 output[2] = fdct_round_shift(s2); 163 output[3] = fdct_round_shift(s3); 164} 165 166static const transform_2d FHT_4[] = { 167 { fdct4, fdct4 }, // DCT_DCT = 0 168 { fadst4, fdct4 }, // ADST_DCT = 1 169 { fdct4, fadst4 }, // DCT_ADST = 2 170 { fadst4, fadst4 } // ADST_ADST = 3 171}; 172 173void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, 174 int stride, int tx_type) { 175 if (tx_type == DCT_DCT) { 176 vp9_fdct4x4_c(input, output, stride); 177 } else { 178 tran_low_t out[4 * 4]; 179 tran_low_t *outptr = &out[0]; 180 int i, j; 181 tran_low_t temp_in[4], temp_out[4]; 182 const transform_2d ht = FHT_4[tx_type]; 183 184 // Columns 185 for (i = 0; i < 4; ++i) { 186 for (j = 0; j < 4; ++j) 187 temp_in[j] = input[j * stride + i] * 16; 188 if (i == 0 && temp_in[0]) 189 temp_in[0] += 1; 190 ht.cols(temp_in, temp_out); 191 for (j = 0; j < 4; ++j) 192 outptr[j * 4 + i] = temp_out[j]; 193 } 194 195 // Rows 196 for (i = 0; i < 4; ++i) { 197 for (j = 0; j < 4; ++j) 198 temp_in[j] = out[j + i * 4]; 199 ht.rows(temp_in, temp_out); 200 for (j = 0; j < 4; ++j) 201 output[j + i * 4] = (temp_out[j] + 1) >> 2; 202 } 203 } 204} 205 206static void fdct8(const tran_low_t *input, tran_low_t *output) { 207 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 208 tran_high_t t0, t1, t2, t3; // needs32 209 tran_high_t x0, x1, x2, x3; // canbe16 210 211 // stage 1 212 s0 = input[0] + input[7]; 213 s1 = input[1] + input[6]; 214 s2 = input[2] + input[5]; 215 s3 = input[3] + input[4]; 216 s4 = input[3] - input[4]; 217 s5 = input[2] - input[5]; 218 s6 = input[1] - input[6]; 219 s7 = input[0] - input[7]; 220 221 // fdct4(step, step); 222 x0 = s0 + s3; 223 x1 = s1 + s2; 224 x2 = s1 - s2; 225 x3 = s0 - s3; 226 t0 = (x0 + x1) * cospi_16_64; 227 t1 = (x0 - x1) * cospi_16_64; 228 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 229 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 230 output[0] = fdct_round_shift(t0); 231 output[2] = fdct_round_shift(t2); 232 output[4] = fdct_round_shift(t1); 233 output[6] = fdct_round_shift(t3); 234 235 // Stage 2 236 t0 = (s6 - s5) * cospi_16_64; 237 t1 = (s6 + s5) * cospi_16_64; 238 t2 = fdct_round_shift(t0); 239 t3 = fdct_round_shift(t1); 240 241 // Stage 3 242 x0 = s4 + t2; 243 x1 = s4 - t2; 244 x2 = s7 - t3; 245 x3 = s7 + t3; 246 247 // Stage 4 248 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 249 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 250 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 251 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 252 output[1] = fdct_round_shift(t0); 253 output[3] = fdct_round_shift(t2); 254 output[5] = fdct_round_shift(t1); 255 output[7] = fdct_round_shift(t3); 256} 257 258void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { 259 int r, c; 260 tran_low_t sum = 0; 261 for (r = 0; r < 8; ++r) 262 for (c = 0; c < 8; ++c) 263 sum += input[r * stride + c]; 264 265 output[0] = sum; 266 output[1] = 0; 267} 268 269void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { 270 int i, j; 271 tran_low_t intermediate[64]; 272 273 // Transform columns 274 { 275 tran_low_t *output = intermediate; 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 277 tran_high_t t0, t1, t2, t3; // needs32 278 tran_high_t x0, x1, x2, x3; // canbe16 279 280 int i; 281 for (i = 0; i < 8; i++) { 282 // stage 1 283 s0 = (input[0 * stride] + input[7 * stride]) * 4; 284 s1 = (input[1 * stride] + input[6 * stride]) * 4; 285 s2 = (input[2 * stride] + input[5 * stride]) * 4; 286 s3 = (input[3 * stride] + input[4 * stride]) * 4; 287 s4 = (input[3 * stride] - input[4 * stride]) * 4; 288 s5 = (input[2 * stride] - input[5 * stride]) * 4; 289 s6 = (input[1 * stride] - input[6 * stride]) * 4; 290 s7 = (input[0 * stride] - input[7 * stride]) * 4; 291 292 // fdct4(step, step); 293 x0 = s0 + s3; 294 x1 = s1 + s2; 295 x2 = s1 - s2; 296 x3 = s0 - s3; 297 t0 = (x0 + x1) * cospi_16_64; 298 t1 = (x0 - x1) * cospi_16_64; 299 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 300 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 301 output[0 * 8] = fdct_round_shift(t0); 302 output[2 * 8] = fdct_round_shift(t2); 303 output[4 * 8] = fdct_round_shift(t1); 304 output[6 * 8] = fdct_round_shift(t3); 305 306 // Stage 2 307 t0 = (s6 - s5) * cospi_16_64; 308 t1 = (s6 + s5) * cospi_16_64; 309 t2 = fdct_round_shift(t0); 310 t3 = fdct_round_shift(t1); 311 312 // Stage 3 313 x0 = s4 + t2; 314 x1 = s4 - t2; 315 x2 = s7 - t3; 316 x3 = s7 + t3; 317 318 // Stage 4 319 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 320 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 321 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 322 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 323 output[1 * 8] = fdct_round_shift(t0); 324 output[3 * 8] = fdct_round_shift(t2); 325 output[5 * 8] = fdct_round_shift(t1); 326 output[7 * 8] = fdct_round_shift(t3); 327 input++; 328 output++; 329 } 330 } 331 332 // Rows 333 for (i = 0; i < 8; ++i) { 334 fdct8(&intermediate[i * 8], &final_output[i * 8]); 335 for (j = 0; j < 8; ++j) 336 final_output[j + i * 8] /= 2; 337 } 338} 339 340void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { 341 int r, c; 342 tran_low_t sum = 0; 343 for (r = 0; r < 16; ++r) 344 for (c = 0; c < 16; ++c) 345 sum += input[r * stride + c]; 346 347 output[0] = sum >> 1; 348 output[1] = 0; 349} 350 351void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { 352 // The 2D transform is done with two passes which are actually pretty 353 // similar. In the first one, we transform the columns and transpose 354 // the results. In the second one, we transform the rows. To achieve that, 355 // as the first pass results are transposed, we transpose the columns (that 356 // is the transposed rows) and transpose the results (so that it goes back 357 // in normal/row positions). 358 int pass; 359 // We need an intermediate buffer between passes. 360 tran_low_t intermediate[256]; 361 const int16_t *in_pass0 = input; 362 const tran_low_t *in = NULL; 363 tran_low_t *out = intermediate; 364 // Do the two transform/transpose passes 365 for (pass = 0; pass < 2; ++pass) { 366 tran_high_t step1[8]; // canbe16 367 tran_high_t step2[8]; // canbe16 368 tran_high_t step3[8]; // canbe16 369 tran_high_t input[8]; // canbe16 370 tran_high_t temp1, temp2; // needs32 371 int i; 372 for (i = 0; i < 16; i++) { 373 if (0 == pass) { 374 // Calculate input for the first 8 results. 375 input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; 376 input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; 377 input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; 378 input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; 379 input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; 380 input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; 381 input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; 382 input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; 383 // Calculate input for the next 8 results. 384 step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; 385 step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; 386 step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; 387 step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; 388 step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; 389 step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; 390 step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; 391 step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; 392 } else { 393 // Calculate input for the first 8 results. 394 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); 395 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); 396 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); 397 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); 398 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); 399 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); 400 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); 401 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); 402 // Calculate input for the next 8 results. 403 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); 404 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); 405 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); 406 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); 407 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); 408 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); 409 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); 410 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); 411 } 412 // Work on the first eight values; fdct8(input, even_results); 413 { 414 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 415 tran_high_t t0, t1, t2, t3; // needs32 416 tran_high_t x0, x1, x2, x3; // canbe16 417 418 // stage 1 419 s0 = input[0] + input[7]; 420 s1 = input[1] + input[6]; 421 s2 = input[2] + input[5]; 422 s3 = input[3] + input[4]; 423 s4 = input[3] - input[4]; 424 s5 = input[2] - input[5]; 425 s6 = input[1] - input[6]; 426 s7 = input[0] - input[7]; 427 428 // fdct4(step, step); 429 x0 = s0 + s3; 430 x1 = s1 + s2; 431 x2 = s1 - s2; 432 x3 = s0 - s3; 433 t0 = (x0 + x1) * cospi_16_64; 434 t1 = (x0 - x1) * cospi_16_64; 435 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 436 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 437 out[0] = fdct_round_shift(t0); 438 out[4] = fdct_round_shift(t2); 439 out[8] = fdct_round_shift(t1); 440 out[12] = fdct_round_shift(t3); 441 442 // Stage 2 443 t0 = (s6 - s5) * cospi_16_64; 444 t1 = (s6 + s5) * cospi_16_64; 445 t2 = fdct_round_shift(t0); 446 t3 = fdct_round_shift(t1); 447 448 // Stage 3 449 x0 = s4 + t2; 450 x1 = s4 - t2; 451 x2 = s7 - t3; 452 x3 = s7 + t3; 453 454 // Stage 4 455 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 456 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 457 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 458 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 459 out[2] = fdct_round_shift(t0); 460 out[6] = fdct_round_shift(t2); 461 out[10] = fdct_round_shift(t1); 462 out[14] = fdct_round_shift(t3); 463 } 464 // Work on the next eight values; step1 -> odd_results 465 { 466 // step 2 467 temp1 = (step1[5] - step1[2]) * cospi_16_64; 468 temp2 = (step1[4] - step1[3]) * cospi_16_64; 469 step2[2] = fdct_round_shift(temp1); 470 step2[3] = fdct_round_shift(temp2); 471 temp1 = (step1[4] + step1[3]) * cospi_16_64; 472 temp2 = (step1[5] + step1[2]) * cospi_16_64; 473 step2[4] = fdct_round_shift(temp1); 474 step2[5] = fdct_round_shift(temp2); 475 // step 3 476 step3[0] = step1[0] + step2[3]; 477 step3[1] = step1[1] + step2[2]; 478 step3[2] = step1[1] - step2[2]; 479 step3[3] = step1[0] - step2[3]; 480 step3[4] = step1[7] - step2[4]; 481 step3[5] = step1[6] - step2[5]; 482 step3[6] = step1[6] + step2[5]; 483 step3[7] = step1[7] + step2[4]; 484 // step 4 485 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 486 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; 487 step2[1] = fdct_round_shift(temp1); 488 step2[2] = fdct_round_shift(temp2); 489 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; 490 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 491 step2[5] = fdct_round_shift(temp1); 492 step2[6] = fdct_round_shift(temp2); 493 // step 5 494 step1[0] = step3[0] + step2[1]; 495 step1[1] = step3[0] - step2[1]; 496 step1[2] = step3[3] + step2[2]; 497 step1[3] = step3[3] - step2[2]; 498 step1[4] = step3[4] - step2[5]; 499 step1[5] = step3[4] + step2[5]; 500 step1[6] = step3[7] - step2[6]; 501 step1[7] = step3[7] + step2[6]; 502 // step 6 503 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 504 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 505 out[1] = fdct_round_shift(temp1); 506 out[9] = fdct_round_shift(temp2); 507 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 508 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 509 out[5] = fdct_round_shift(temp1); 510 out[13] = fdct_round_shift(temp2); 511 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 512 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 513 out[3] = fdct_round_shift(temp1); 514 out[11] = fdct_round_shift(temp2); 515 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 516 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 517 out[7] = fdct_round_shift(temp1); 518 out[15] = fdct_round_shift(temp2); 519 } 520 // Do next column (which is a transposed row in second/horizontal pass) 521 in++; 522 in_pass0++; 523 out += 16; 524 } 525 // Setup in/out for next pass. 526 in = intermediate; 527 out = output; 528 } 529} 530 531static void fadst8(const tran_low_t *input, tran_low_t *output) { 532 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 533 534 tran_high_t x0 = input[7]; 535 tran_high_t x1 = input[0]; 536 tran_high_t x2 = input[5]; 537 tran_high_t x3 = input[2]; 538 tran_high_t x4 = input[3]; 539 tran_high_t x5 = input[4]; 540 tran_high_t x6 = input[1]; 541 tran_high_t x7 = input[6]; 542 543 // stage 1 544 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 545 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 546 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 547 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 548 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 549 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 550 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 551 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 552 553 x0 = fdct_round_shift(s0 + s4); 554 x1 = fdct_round_shift(s1 + s5); 555 x2 = fdct_round_shift(s2 + s6); 556 x3 = fdct_round_shift(s3 + s7); 557 x4 = fdct_round_shift(s0 - s4); 558 x5 = fdct_round_shift(s1 - s5); 559 x6 = fdct_round_shift(s2 - s6); 560 x7 = fdct_round_shift(s3 - s7); 561 562 // stage 2 563 s0 = x0; 564 s1 = x1; 565 s2 = x2; 566 s3 = x3; 567 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 568 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 569 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; 570 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 571 572 x0 = s0 + s2; 573 x1 = s1 + s3; 574 x2 = s0 - s2; 575 x3 = s1 - s3; 576 x4 = fdct_round_shift(s4 + s6); 577 x5 = fdct_round_shift(s5 + s7); 578 x6 = fdct_round_shift(s4 - s6); 579 x7 = fdct_round_shift(s5 - s7); 580 581 // stage 3 582 s2 = cospi_16_64 * (x2 + x3); 583 s3 = cospi_16_64 * (x2 - x3); 584 s6 = cospi_16_64 * (x6 + x7); 585 s7 = cospi_16_64 * (x6 - x7); 586 587 x2 = fdct_round_shift(s2); 588 x3 = fdct_round_shift(s3); 589 x6 = fdct_round_shift(s6); 590 x7 = fdct_round_shift(s7); 591 592 output[0] = x0; 593 output[1] = - x4; 594 output[2] = x6; 595 output[3] = - x2; 596 output[4] = x3; 597 output[5] = - x7; 598 output[6] = x5; 599 output[7] = - x1; 600} 601 602static const transform_2d FHT_8[] = { 603 { fdct8, fdct8 }, // DCT_DCT = 0 604 { fadst8, fdct8 }, // ADST_DCT = 1 605 { fdct8, fadst8 }, // DCT_ADST = 2 606 { fadst8, fadst8 } // ADST_ADST = 3 607}; 608 609void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, 610 int stride, int tx_type) { 611 if (tx_type == DCT_DCT) { 612 vp9_fdct8x8_c(input, output, stride); 613 } else { 614 tran_low_t out[64]; 615 tran_low_t *outptr = &out[0]; 616 int i, j; 617 tran_low_t temp_in[8], temp_out[8]; 618 const transform_2d ht = FHT_8[tx_type]; 619 620 // Columns 621 for (i = 0; i < 8; ++i) { 622 for (j = 0; j < 8; ++j) 623 temp_in[j] = input[j * stride + i] * 4; 624 ht.cols(temp_in, temp_out); 625 for (j = 0; j < 8; ++j) 626 outptr[j * 8 + i] = temp_out[j]; 627 } 628 629 // Rows 630 for (i = 0; i < 8; ++i) { 631 for (j = 0; j < 8; ++j) 632 temp_in[j] = out[j + i * 8]; 633 ht.rows(temp_in, temp_out); 634 for (j = 0; j < 8; ++j) 635 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; 636 } 637 } 638} 639 640/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per 641 pixel. */ 642void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { 643 int i; 644 tran_high_t a1, b1, c1, d1, e1; 645 const int16_t *ip_pass0 = input; 646 const tran_low_t *ip = NULL; 647 tran_low_t *op = output; 648 649 for (i = 0; i < 4; i++) { 650 a1 = ip_pass0[0 * stride]; 651 b1 = ip_pass0[1 * stride]; 652 c1 = ip_pass0[2 * stride]; 653 d1 = ip_pass0[3 * stride]; 654 655 a1 += b1; 656 d1 = d1 - c1; 657 e1 = (a1 - d1) >> 1; 658 b1 = e1 - b1; 659 c1 = e1 - c1; 660 a1 -= c1; 661 d1 += b1; 662 op[0] = a1; 663 op[4] = c1; 664 op[8] = d1; 665 op[12] = b1; 666 667 ip_pass0++; 668 op++; 669 } 670 ip = output; 671 op = output; 672 673 for (i = 0; i < 4; i++) { 674 a1 = ip[0]; 675 b1 = ip[1]; 676 c1 = ip[2]; 677 d1 = ip[3]; 678 679 a1 += b1; 680 d1 -= c1; 681 e1 = (a1 - d1) >> 1; 682 b1 = e1 - b1; 683 c1 = e1 - c1; 684 a1 -= c1; 685 d1 += b1; 686 op[0] = a1 * UNIT_QUANT_FACTOR; 687 op[1] = c1 * UNIT_QUANT_FACTOR; 688 op[2] = d1 * UNIT_QUANT_FACTOR; 689 op[3] = b1 * UNIT_QUANT_FACTOR; 690 691 ip += 4; 692 op += 4; 693 } 694} 695 696// Rewrote to use same algorithm as others. 697static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { 698 tran_high_t step1[8]; // canbe16 699 tran_high_t step2[8]; // canbe16 700 tran_high_t step3[8]; // canbe16 701 tran_high_t input[8]; // canbe16 702 tran_high_t temp1, temp2; // needs32 703 704 // step 1 705 input[0] = in[0] + in[15]; 706 input[1] = in[1] + in[14]; 707 input[2] = in[2] + in[13]; 708 input[3] = in[3] + in[12]; 709 input[4] = in[4] + in[11]; 710 input[5] = in[5] + in[10]; 711 input[6] = in[6] + in[ 9]; 712 input[7] = in[7] + in[ 8]; 713 714 step1[0] = in[7] - in[ 8]; 715 step1[1] = in[6] - in[ 9]; 716 step1[2] = in[5] - in[10]; 717 step1[3] = in[4] - in[11]; 718 step1[4] = in[3] - in[12]; 719 step1[5] = in[2] - in[13]; 720 step1[6] = in[1] - in[14]; 721 step1[7] = in[0] - in[15]; 722 723 // fdct8(step, step); 724 { 725 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 726 tran_high_t t0, t1, t2, t3; // needs32 727 tran_high_t x0, x1, x2, x3; // canbe16 728 729 // stage 1 730 s0 = input[0] + input[7]; 731 s1 = input[1] + input[6]; 732 s2 = input[2] + input[5]; 733 s3 = input[3] + input[4]; 734 s4 = input[3] - input[4]; 735 s5 = input[2] - input[5]; 736 s6 = input[1] - input[6]; 737 s7 = input[0] - input[7]; 738 739 // fdct4(step, step); 740 x0 = s0 + s3; 741 x1 = s1 + s2; 742 x2 = s1 - s2; 743 x3 = s0 - s3; 744 t0 = (x0 + x1) * cospi_16_64; 745 t1 = (x0 - x1) * cospi_16_64; 746 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 747 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 748 out[0] = fdct_round_shift(t0); 749 out[4] = fdct_round_shift(t2); 750 out[8] = fdct_round_shift(t1); 751 out[12] = fdct_round_shift(t3); 752 753 // Stage 2 754 t0 = (s6 - s5) * cospi_16_64; 755 t1 = (s6 + s5) * cospi_16_64; 756 t2 = fdct_round_shift(t0); 757 t3 = fdct_round_shift(t1); 758 759 // Stage 3 760 x0 = s4 + t2; 761 x1 = s4 - t2; 762 x2 = s7 - t3; 763 x3 = s7 + t3; 764 765 // Stage 4 766 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 767 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 768 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 769 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 770 out[2] = fdct_round_shift(t0); 771 out[6] = fdct_round_shift(t2); 772 out[10] = fdct_round_shift(t1); 773 out[14] = fdct_round_shift(t3); 774 } 775 776 // step 2 777 temp1 = (step1[5] - step1[2]) * cospi_16_64; 778 temp2 = (step1[4] - step1[3]) * cospi_16_64; 779 step2[2] = fdct_round_shift(temp1); 780 step2[3] = fdct_round_shift(temp2); 781 temp1 = (step1[4] + step1[3]) * cospi_16_64; 782 temp2 = (step1[5] + step1[2]) * cospi_16_64; 783 step2[4] = fdct_round_shift(temp1); 784 step2[5] = fdct_round_shift(temp2); 785 786 // step 3 787 step3[0] = step1[0] + step2[3]; 788 step3[1] = step1[1] + step2[2]; 789 step3[2] = step1[1] - step2[2]; 790 step3[3] = step1[0] - step2[3]; 791 step3[4] = step1[7] - step2[4]; 792 step3[5] = step1[6] - step2[5]; 793 step3[6] = step1[6] + step2[5]; 794 step3[7] = step1[7] + step2[4]; 795 796 // step 4 797 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 798 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; 799 step2[1] = fdct_round_shift(temp1); 800 step2[2] = fdct_round_shift(temp2); 801 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; 802 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 803 step2[5] = fdct_round_shift(temp1); 804 step2[6] = fdct_round_shift(temp2); 805 806 // step 5 807 step1[0] = step3[0] + step2[1]; 808 step1[1] = step3[0] - step2[1]; 809 step1[2] = step3[3] + step2[2]; 810 step1[3] = step3[3] - step2[2]; 811 step1[4] = step3[4] - step2[5]; 812 step1[5] = step3[4] + step2[5]; 813 step1[6] = step3[7] - step2[6]; 814 step1[7] = step3[7] + step2[6]; 815 816 // step 6 817 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 818 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 819 out[1] = fdct_round_shift(temp1); 820 out[9] = fdct_round_shift(temp2); 821 822 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 823 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 824 out[5] = fdct_round_shift(temp1); 825 out[13] = fdct_round_shift(temp2); 826 827 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 828 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 829 out[3] = fdct_round_shift(temp1); 830 out[11] = fdct_round_shift(temp2); 831 832 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 833 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 834 out[7] = fdct_round_shift(temp1); 835 out[15] = fdct_round_shift(temp2); 836} 837 838static void fadst16(const tran_low_t *input, tran_low_t *output) { 839 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 840 tran_high_t s9, s10, s11, s12, s13, s14, s15; 841 842 tran_high_t x0 = input[15]; 843 tran_high_t x1 = input[0]; 844 tran_high_t x2 = input[13]; 845 tran_high_t x3 = input[2]; 846 tran_high_t x4 = input[11]; 847 tran_high_t x5 = input[4]; 848 tran_high_t x6 = input[9]; 849 tran_high_t x7 = input[6]; 850 tran_high_t x8 = input[7]; 851 tran_high_t x9 = input[8]; 852 tran_high_t x10 = input[5]; 853 tran_high_t x11 = input[10]; 854 tran_high_t x12 = input[3]; 855 tran_high_t x13 = input[12]; 856 tran_high_t x14 = input[1]; 857 tran_high_t x15 = input[14]; 858 859 // stage 1 860 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 861 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 862 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 863 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 864 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 865 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 866 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 867 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 868 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 869 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 870 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 871 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 872 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 873 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 874 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 875 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 876 877 x0 = fdct_round_shift(s0 + s8); 878 x1 = fdct_round_shift(s1 + s9); 879 x2 = fdct_round_shift(s2 + s10); 880 x3 = fdct_round_shift(s3 + s11); 881 x4 = fdct_round_shift(s4 + s12); 882 x5 = fdct_round_shift(s5 + s13); 883 x6 = fdct_round_shift(s6 + s14); 884 x7 = fdct_round_shift(s7 + s15); 885 x8 = fdct_round_shift(s0 - s8); 886 x9 = fdct_round_shift(s1 - s9); 887 x10 = fdct_round_shift(s2 - s10); 888 x11 = fdct_round_shift(s3 - s11); 889 x12 = fdct_round_shift(s4 - s12); 890 x13 = fdct_round_shift(s5 - s13); 891 x14 = fdct_round_shift(s6 - s14); 892 x15 = fdct_round_shift(s7 - s15); 893 894 // stage 2 895 s0 = x0; 896 s1 = x1; 897 s2 = x2; 898 s3 = x3; 899 s4 = x4; 900 s5 = x5; 901 s6 = x6; 902 s7 = x7; 903 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 904 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 905 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 906 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 907 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 908 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 909 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 910 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 911 912 x0 = s0 + s4; 913 x1 = s1 + s5; 914 x2 = s2 + s6; 915 x3 = s3 + s7; 916 x4 = s0 - s4; 917 x5 = s1 - s5; 918 x6 = s2 - s6; 919 x7 = s3 - s7; 920 x8 = fdct_round_shift(s8 + s12); 921 x9 = fdct_round_shift(s9 + s13); 922 x10 = fdct_round_shift(s10 + s14); 923 x11 = fdct_round_shift(s11 + s15); 924 x12 = fdct_round_shift(s8 - s12); 925 x13 = fdct_round_shift(s9 - s13); 926 x14 = fdct_round_shift(s10 - s14); 927 x15 = fdct_round_shift(s11 - s15); 928 929 // stage 3 930 s0 = x0; 931 s1 = x1; 932 s2 = x2; 933 s3 = x3; 934 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 935 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 936 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 937 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 938 s8 = x8; 939 s9 = x9; 940 s10 = x10; 941 s11 = x11; 942 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 943 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 944 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 945 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 946 947 x0 = s0 + s2; 948 x1 = s1 + s3; 949 x2 = s0 - s2; 950 x3 = s1 - s3; 951 x4 = fdct_round_shift(s4 + s6); 952 x5 = fdct_round_shift(s5 + s7); 953 x6 = fdct_round_shift(s4 - s6); 954 x7 = fdct_round_shift(s5 - s7); 955 x8 = s8 + s10; 956 x9 = s9 + s11; 957 x10 = s8 - s10; 958 x11 = s9 - s11; 959 x12 = fdct_round_shift(s12 + s14); 960 x13 = fdct_round_shift(s13 + s15); 961 x14 = fdct_round_shift(s12 - s14); 962 x15 = fdct_round_shift(s13 - s15); 963 964 // stage 4 965 s2 = (- cospi_16_64) * (x2 + x3); 966 s3 = cospi_16_64 * (x2 - x3); 967 s6 = cospi_16_64 * (x6 + x7); 968 s7 = cospi_16_64 * (- x6 + x7); 969 s10 = cospi_16_64 * (x10 + x11); 970 s11 = cospi_16_64 * (- x10 + x11); 971 s14 = (- cospi_16_64) * (x14 + x15); 972 s15 = cospi_16_64 * (x14 - x15); 973 974 x2 = fdct_round_shift(s2); 975 x3 = fdct_round_shift(s3); 976 x6 = fdct_round_shift(s6); 977 x7 = fdct_round_shift(s7); 978 x10 = fdct_round_shift(s10); 979 x11 = fdct_round_shift(s11); 980 x14 = fdct_round_shift(s14); 981 x15 = fdct_round_shift(s15); 982 983 output[0] = x0; 984 output[1] = - x8; 985 output[2] = x12; 986 output[3] = - x4; 987 output[4] = x6; 988 output[5] = x14; 989 output[6] = x10; 990 output[7] = x2; 991 output[8] = x3; 992 output[9] = x11; 993 output[10] = x15; 994 output[11] = x7; 995 output[12] = x5; 996 output[13] = - x13; 997 output[14] = x9; 998 output[15] = - x1; 999} 1000 1001static const transform_2d FHT_16[] = { 1002 { fdct16, fdct16 }, // DCT_DCT = 0 1003 { fadst16, fdct16 }, // ADST_DCT = 1 1004 { fdct16, fadst16 }, // DCT_ADST = 2 1005 { fadst16, fadst16 } // ADST_ADST = 3 1006}; 1007 1008void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, 1009 int stride, int tx_type) { 1010 if (tx_type == DCT_DCT) { 1011 vp9_fdct16x16_c(input, output, stride); 1012 } else { 1013 tran_low_t out[256]; 1014 tran_low_t *outptr = &out[0]; 1015 int i, j; 1016 tran_low_t temp_in[16], temp_out[16]; 1017 const transform_2d ht = FHT_16[tx_type]; 1018 1019 // Columns 1020 for (i = 0; i < 16; ++i) { 1021 for (j = 0; j < 16; ++j) 1022 temp_in[j] = input[j * stride + i] * 4; 1023 ht.cols(temp_in, temp_out); 1024 for (j = 0; j < 16; ++j) 1025 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1026 } 1027 1028 // Rows 1029 for (i = 0; i < 16; ++i) { 1030 for (j = 0; j < 16; ++j) 1031 temp_in[j] = out[j + i * 16]; 1032 ht.rows(temp_in, temp_out); 1033 for (j = 0; j < 16; ++j) 1034 output[j + i * 16] = temp_out[j]; 1035 } 1036 } 1037} 1038 1039static INLINE tran_high_t dct_32_round(tran_high_t input) { 1040 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 1041 // TODO(debargha, peter.derivaz): Find new bounds for this assert, 1042 // and make the bounds consts. 1043 // assert(-131072 <= rv && rv <= 131071); 1044 return rv; 1045} 1046 1047static INLINE tran_high_t half_round_shift(tran_high_t input) { 1048 tran_high_t rv = (input + 1 + (input < 0)) >> 2; 1049 return rv; 1050} 1051 1052static void fdct32(const tran_high_t *input, tran_high_t *output, int round) { 1053 tran_high_t step[32]; 1054 // Stage 1 1055 step[0] = input[0] + input[(32 - 1)]; 1056 step[1] = input[1] + input[(32 - 2)]; 1057 step[2] = input[2] + input[(32 - 3)]; 1058 step[3] = input[3] + input[(32 - 4)]; 1059 step[4] = input[4] + input[(32 - 5)]; 1060 step[5] = input[5] + input[(32 - 6)]; 1061 step[6] = input[6] + input[(32 - 7)]; 1062 step[7] = input[7] + input[(32 - 8)]; 1063 step[8] = input[8] + input[(32 - 9)]; 1064 step[9] = input[9] + input[(32 - 10)]; 1065 step[10] = input[10] + input[(32 - 11)]; 1066 step[11] = input[11] + input[(32 - 12)]; 1067 step[12] = input[12] + input[(32 - 13)]; 1068 step[13] = input[13] + input[(32 - 14)]; 1069 step[14] = input[14] + input[(32 - 15)]; 1070 step[15] = input[15] + input[(32 - 16)]; 1071 step[16] = -input[16] + input[(32 - 17)]; 1072 step[17] = -input[17] + input[(32 - 18)]; 1073 step[18] = -input[18] + input[(32 - 19)]; 1074 step[19] = -input[19] + input[(32 - 20)]; 1075 step[20] = -input[20] + input[(32 - 21)]; 1076 step[21] = -input[21] + input[(32 - 22)]; 1077 step[22] = -input[22] + input[(32 - 23)]; 1078 step[23] = -input[23] + input[(32 - 24)]; 1079 step[24] = -input[24] + input[(32 - 25)]; 1080 step[25] = -input[25] + input[(32 - 26)]; 1081 step[26] = -input[26] + input[(32 - 27)]; 1082 step[27] = -input[27] + input[(32 - 28)]; 1083 step[28] = -input[28] + input[(32 - 29)]; 1084 step[29] = -input[29] + input[(32 - 30)]; 1085 step[30] = -input[30] + input[(32 - 31)]; 1086 step[31] = -input[31] + input[(32 - 32)]; 1087 1088 // Stage 2 1089 output[0] = step[0] + step[16 - 1]; 1090 output[1] = step[1] + step[16 - 2]; 1091 output[2] = step[2] + step[16 - 3]; 1092 output[3] = step[3] + step[16 - 4]; 1093 output[4] = step[4] + step[16 - 5]; 1094 output[5] = step[5] + step[16 - 6]; 1095 output[6] = step[6] + step[16 - 7]; 1096 output[7] = step[7] + step[16 - 8]; 1097 output[8] = -step[8] + step[16 - 9]; 1098 output[9] = -step[9] + step[16 - 10]; 1099 output[10] = -step[10] + step[16 - 11]; 1100 output[11] = -step[11] + step[16 - 12]; 1101 output[12] = -step[12] + step[16 - 13]; 1102 output[13] = -step[13] + step[16 - 14]; 1103 output[14] = -step[14] + step[16 - 15]; 1104 output[15] = -step[15] + step[16 - 16]; 1105 1106 output[16] = step[16]; 1107 output[17] = step[17]; 1108 output[18] = step[18]; 1109 output[19] = step[19]; 1110 1111 output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); 1112 output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); 1113 output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); 1114 output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); 1115 1116 output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); 1117 output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); 1118 output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); 1119 output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); 1120 1121 output[28] = step[28]; 1122 output[29] = step[29]; 1123 output[30] = step[30]; 1124 output[31] = step[31]; 1125 1126 // dump the magnitude by 4, hence the intermediate values are within 1127 // the range of 16 bits. 1128 if (round) { 1129 output[0] = half_round_shift(output[0]); 1130 output[1] = half_round_shift(output[1]); 1131 output[2] = half_round_shift(output[2]); 1132 output[3] = half_round_shift(output[3]); 1133 output[4] = half_round_shift(output[4]); 1134 output[5] = half_round_shift(output[5]); 1135 output[6] = half_round_shift(output[6]); 1136 output[7] = half_round_shift(output[7]); 1137 output[8] = half_round_shift(output[8]); 1138 output[9] = half_round_shift(output[9]); 1139 output[10] = half_round_shift(output[10]); 1140 output[11] = half_round_shift(output[11]); 1141 output[12] = half_round_shift(output[12]); 1142 output[13] = half_round_shift(output[13]); 1143 output[14] = half_round_shift(output[14]); 1144 output[15] = half_round_shift(output[15]); 1145 1146 output[16] = half_round_shift(output[16]); 1147 output[17] = half_round_shift(output[17]); 1148 output[18] = half_round_shift(output[18]); 1149 output[19] = half_round_shift(output[19]); 1150 output[20] = half_round_shift(output[20]); 1151 output[21] = half_round_shift(output[21]); 1152 output[22] = half_round_shift(output[22]); 1153 output[23] = half_round_shift(output[23]); 1154 output[24] = half_round_shift(output[24]); 1155 output[25] = half_round_shift(output[25]); 1156 output[26] = half_round_shift(output[26]); 1157 output[27] = half_round_shift(output[27]); 1158 output[28] = half_round_shift(output[28]); 1159 output[29] = half_round_shift(output[29]); 1160 output[30] = half_round_shift(output[30]); 1161 output[31] = half_round_shift(output[31]); 1162 } 1163 1164 // Stage 3 1165 step[0] = output[0] + output[(8 - 1)]; 1166 step[1] = output[1] + output[(8 - 2)]; 1167 step[2] = output[2] + output[(8 - 3)]; 1168 step[3] = output[3] + output[(8 - 4)]; 1169 step[4] = -output[4] + output[(8 - 5)]; 1170 step[5] = -output[5] + output[(8 - 6)]; 1171 step[6] = -output[6] + output[(8 - 7)]; 1172 step[7] = -output[7] + output[(8 - 8)]; 1173 step[8] = output[8]; 1174 step[9] = output[9]; 1175 step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); 1176 step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); 1177 step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); 1178 step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); 1179 step[14] = output[14]; 1180 step[15] = output[15]; 1181 1182 step[16] = output[16] + output[23]; 1183 step[17] = output[17] + output[22]; 1184 step[18] = output[18] + output[21]; 1185 step[19] = output[19] + output[20]; 1186 step[20] = -output[20] + output[19]; 1187 step[21] = -output[21] + output[18]; 1188 step[22] = -output[22] + output[17]; 1189 step[23] = -output[23] + output[16]; 1190 step[24] = -output[24] + output[31]; 1191 step[25] = -output[25] + output[30]; 1192 step[26] = -output[26] + output[29]; 1193 step[27] = -output[27] + output[28]; 1194 step[28] = output[28] + output[27]; 1195 step[29] = output[29] + output[26]; 1196 step[30] = output[30] + output[25]; 1197 step[31] = output[31] + output[24]; 1198 1199 // Stage 4 1200 output[0] = step[0] + step[3]; 1201 output[1] = step[1] + step[2]; 1202 output[2] = -step[2] + step[1]; 1203 output[3] = -step[3] + step[0]; 1204 output[4] = step[4]; 1205 output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); 1206 output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); 1207 output[7] = step[7]; 1208 output[8] = step[8] + step[11]; 1209 output[9] = step[9] + step[10]; 1210 output[10] = -step[10] + step[9]; 1211 output[11] = -step[11] + step[8]; 1212 output[12] = -step[12] + step[15]; 1213 output[13] = -step[13] + step[14]; 1214 output[14] = step[14] + step[13]; 1215 output[15] = step[15] + step[12]; 1216 1217 output[16] = step[16]; 1218 output[17] = step[17]; 1219 output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); 1220 output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); 1221 output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); 1222 output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); 1223 output[22] = step[22]; 1224 output[23] = step[23]; 1225 output[24] = step[24]; 1226 output[25] = step[25]; 1227 output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); 1228 output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); 1229 output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); 1230 output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); 1231 output[30] = step[30]; 1232 output[31] = step[31]; 1233 1234 // Stage 5 1235 step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); 1236 step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); 1237 step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); 1238 step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); 1239 step[4] = output[4] + output[5]; 1240 step[5] = -output[5] + output[4]; 1241 step[6] = -output[6] + output[7]; 1242 step[7] = output[7] + output[6]; 1243 step[8] = output[8]; 1244 step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); 1245 step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); 1246 step[11] = output[11]; 1247 step[12] = output[12]; 1248 step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); 1249 step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); 1250 step[15] = output[15]; 1251 1252 step[16] = output[16] + output[19]; 1253 step[17] = output[17] + output[18]; 1254 step[18] = -output[18] + output[17]; 1255 step[19] = -output[19] + output[16]; 1256 step[20] = -output[20] + output[23]; 1257 step[21] = -output[21] + output[22]; 1258 step[22] = output[22] + output[21]; 1259 step[23] = output[23] + output[20]; 1260 step[24] = output[24] + output[27]; 1261 step[25] = output[25] + output[26]; 1262 step[26] = -output[26] + output[25]; 1263 step[27] = -output[27] + output[24]; 1264 step[28] = -output[28] + output[31]; 1265 step[29] = -output[29] + output[30]; 1266 step[30] = output[30] + output[29]; 1267 step[31] = output[31] + output[28]; 1268 1269 // Stage 6 1270 output[0] = step[0]; 1271 output[1] = step[1]; 1272 output[2] = step[2]; 1273 output[3] = step[3]; 1274 output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); 1275 output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); 1276 output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); 1277 output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); 1278 output[8] = step[8] + step[9]; 1279 output[9] = -step[9] + step[8]; 1280 output[10] = -step[10] + step[11]; 1281 output[11] = step[11] + step[10]; 1282 output[12] = step[12] + step[13]; 1283 output[13] = -step[13] + step[12]; 1284 output[14] = -step[14] + step[15]; 1285 output[15] = step[15] + step[14]; 1286 1287 output[16] = step[16]; 1288 output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); 1289 output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); 1290 output[19] = step[19]; 1291 output[20] = step[20]; 1292 output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); 1293 output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); 1294 output[23] = step[23]; 1295 output[24] = step[24]; 1296 output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); 1297 output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); 1298 output[27] = step[27]; 1299 output[28] = step[28]; 1300 output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); 1301 output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); 1302 output[31] = step[31]; 1303 1304 // Stage 7 1305 step[0] = output[0]; 1306 step[1] = output[1]; 1307 step[2] = output[2]; 1308 step[3] = output[3]; 1309 step[4] = output[4]; 1310 step[5] = output[5]; 1311 step[6] = output[6]; 1312 step[7] = output[7]; 1313 step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); 1314 step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); 1315 step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); 1316 step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); 1317 step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); 1318 step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); 1319 step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); 1320 step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); 1321 1322 step[16] = output[16] + output[17]; 1323 step[17] = -output[17] + output[16]; 1324 step[18] = -output[18] + output[19]; 1325 step[19] = output[19] + output[18]; 1326 step[20] = output[20] + output[21]; 1327 step[21] = -output[21] + output[20]; 1328 step[22] = -output[22] + output[23]; 1329 step[23] = output[23] + output[22]; 1330 step[24] = output[24] + output[25]; 1331 step[25] = -output[25] + output[24]; 1332 step[26] = -output[26] + output[27]; 1333 step[27] = output[27] + output[26]; 1334 step[28] = output[28] + output[29]; 1335 step[29] = -output[29] + output[28]; 1336 step[30] = -output[30] + output[31]; 1337 step[31] = output[31] + output[30]; 1338 1339 // Final stage --- outputs indices are bit-reversed. 1340 output[0] = step[0]; 1341 output[16] = step[1]; 1342 output[8] = step[2]; 1343 output[24] = step[3]; 1344 output[4] = step[4]; 1345 output[20] = step[5]; 1346 output[12] = step[6]; 1347 output[28] = step[7]; 1348 output[2] = step[8]; 1349 output[18] = step[9]; 1350 output[10] = step[10]; 1351 output[26] = step[11]; 1352 output[6] = step[12]; 1353 output[22] = step[13]; 1354 output[14] = step[14]; 1355 output[30] = step[15]; 1356 1357 output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); 1358 output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); 1359 output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); 1360 output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); 1361 output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); 1362 output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); 1363 output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); 1364 output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); 1365 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); 1366 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); 1367 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); 1368 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); 1369 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); 1370 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); 1371 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); 1372 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); 1373} 1374 1375void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { 1376 int r, c; 1377 tran_low_t sum = 0; 1378 for (r = 0; r < 32; ++r) 1379 for (c = 0; c < 32; ++c) 1380 sum += input[r * stride + c]; 1381 1382 output[0] = sum >> 3; 1383 output[1] = 0; 1384} 1385 1386void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { 1387 int i, j; 1388 tran_high_t output[32 * 32]; 1389 1390 // Columns 1391 for (i = 0; i < 32; ++i) { 1392 tran_high_t temp_in[32], temp_out[32]; 1393 for (j = 0; j < 32; ++j) 1394 temp_in[j] = input[j * stride + i] * 4; 1395 fdct32(temp_in, temp_out, 0); 1396 for (j = 0; j < 32; ++j) 1397 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1398 } 1399 1400 // Rows 1401 for (i = 0; i < 32; ++i) { 1402 tran_high_t temp_in[32], temp_out[32]; 1403 for (j = 0; j < 32; ++j) 1404 temp_in[j] = output[j + i * 32]; 1405 fdct32(temp_in, temp_out, 0); 1406 for (j = 0; j < 32; ++j) 1407 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1408 } 1409} 1410 1411// Note that although we use dct_32_round in dct32 computation flow, 1412// this 2d fdct32x32 for rate-distortion optimization loop is operating 1413// within 16 bits precision. 1414void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { 1415 int i, j; 1416 tran_high_t output[32 * 32]; 1417 1418 // Columns 1419 for (i = 0; i < 32; ++i) { 1420 tran_high_t temp_in[32], temp_out[32]; 1421 for (j = 0; j < 32; ++j) 1422 temp_in[j] = input[j * stride + i] * 4; 1423 fdct32(temp_in, temp_out, 0); 1424 for (j = 0; j < 32; ++j) 1425 // TODO(cd): see quality impact of only doing 1426 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; 1427 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c 1428 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1429 } 1430 1431 // Rows 1432 for (i = 0; i < 32; ++i) { 1433 tran_high_t temp_in[32], temp_out[32]; 1434 for (j = 0; j < 32; ++j) 1435 temp_in[j] = output[j + i * 32]; 1436 fdct32(temp_in, temp_out, 1); 1437 for (j = 0; j < 32; ++j) 1438 out[j + i * 32] = temp_out[j]; 1439 } 1440} 1441 1442#if CONFIG_VP9_HIGHBITDEPTH 1443void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { 1444 vp9_fdct4x4_c(input, output, stride); 1445} 1446 1447void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output, 1448 int stride, int tx_type) { 1449 vp9_fht4x4_c(input, output, stride, tx_type); 1450} 1451 1452void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, 1453 int stride) { 1454 vp9_fdct8x8_1_c(input, final_output, stride); 1455} 1456 1457void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output, 1458 int stride) { 1459 vp9_fdct8x8_c(input, final_output, stride); 1460} 1461 1462void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output, 1463 int stride) { 1464 vp9_fdct16x16_1_c(input, output, stride); 1465} 1466 1467void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output, 1468 int stride) { 1469 vp9_fdct16x16_c(input, output, stride); 1470} 1471 1472void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output, 1473 int stride, int tx_type) { 1474 vp9_fht8x8_c(input, output, stride, tx_type); 1475} 1476 1477void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { 1478 vp9_fwht4x4_c(input, output, stride); 1479} 1480 1481void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output, 1482 int stride, int tx_type) { 1483 vp9_fht16x16_c(input, output, stride, tx_type); 1484} 1485 1486void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) { 1487 vp9_fdct32x32_1_c(input, out, stride); 1488} 1489 1490void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { 1491 vp9_fdct32x32_c(input, out, stride); 1492} 1493 1494void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, 1495 int stride) { 1496 vp9_fdct32x32_rd_c(input, out, stride); 1497} 1498#endif // CONFIG_VP9_HIGHBITDEPTH 1499