1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <math.h> 13 14#include "./vpx_config.h" 15#include "./vp9_rtcd.h" 16 17#include "vp9/common/vp9_blockd.h" 18#include "vp9/common/vp9_idct.h" 19#include "vp9/common/vp9_systemdependent.h" 20 21static INLINE int fdct_round_shift(int input) { 22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 23 assert(INT16_MIN <= rv && rv <= INT16_MAX); 24 return rv; 25} 26 27static void fdct4(const int16_t *input, int16_t *output) { 28 int16_t step[4]; 29 int temp1, temp2; 30 31 step[0] = input[0] + input[3]; 32 step[1] = input[1] + input[2]; 33 step[2] = input[1] - input[2]; 34 step[3] = input[0] - input[3]; 35 36 temp1 = (step[0] + step[1]) * cospi_16_64; 37 temp2 = (step[0] - step[1]) * cospi_16_64; 38 output[0] = fdct_round_shift(temp1); 39 output[2] = fdct_round_shift(temp2); 40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 42 output[1] = fdct_round_shift(temp1); 43 output[3] = fdct_round_shift(temp2); 44} 45 46void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { 47 // The 2D transform is done with two passes which are actually pretty 48 // similar. In the first one, we transform the columns and transpose 49 // the results. In the second one, we transform the rows. To achieve that, 50 // as the first pass results are transposed, we transpose the columns (that 51 // is the transposed rows) and transpose the results (so that it goes back 52 // in normal/row positions). 53 int pass; 54 // We need an intermediate buffer between passes. 55 int16_t intermediate[4 * 4]; 56 const int16_t *in = input; 57 int16_t *out = intermediate; 58 // Do the two transform/transpose passes 59 for (pass = 0; pass < 2; ++pass) { 60 /*canbe16*/ int input[4]; 61 /*canbe16*/ int step[4]; 62 /*needs32*/ int temp1, temp2; 63 int i; 64 for (i = 0; i < 4; ++i) { 65 // Load inputs. 66 if (0 == pass) { 67 input[0] = in[0 * stride] * 16; 68 input[1] = in[1 * stride] * 16; 69 input[2] = in[2 * stride] * 16; 70 input[3] = in[3 * stride] * 16; 71 if (i == 0 && input[0]) { 72 input[0] += 1; 73 } 74 } else { 75 input[0] = in[0 * 4]; 76 input[1] = in[1 * 4]; 77 input[2] = in[2 * 4]; 78 input[3] = in[3 * 4]; 79 } 80 // Transform. 81 step[0] = input[0] + input[3]; 82 step[1] = input[1] + input[2]; 83 step[2] = input[1] - input[2]; 84 step[3] = input[0] - input[3]; 85 temp1 = (step[0] + step[1]) * cospi_16_64; 86 temp2 = (step[0] - step[1]) * cospi_16_64; 87 out[0] = fdct_round_shift(temp1); 88 out[2] = fdct_round_shift(temp2); 89 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 90 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 91 out[1] = fdct_round_shift(temp1); 92 out[3] = fdct_round_shift(temp2); 93 // Do next column (which is a transposed row in second/horizontal pass) 94 in++; 95 out += 4; 96 } 97 // Setup in/out for next pass. 98 in = intermediate; 99 out = output; 100 } 101 102 { 103 int i, j; 104 for (i = 0; i < 4; ++i) { 105 for (j = 0; j < 4; ++j) 106 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; 107 } 108 } 109} 110 111static void fadst4(const int16_t *input, int16_t *output) { 112 int x0, x1, x2, x3; 113 int s0, s1, s2, s3, s4, s5, s6, s7; 114 115 x0 = input[0]; 116 x1 = input[1]; 117 x2 = input[2]; 118 x3 = input[3]; 119 120 if (!(x0 | x1 | x2 | x3)) { 121 output[0] = output[1] = output[2] = output[3] = 0; 122 return; 123 } 124 125 s0 = sinpi_1_9 * x0; 126 s1 = sinpi_4_9 * x0; 127 s2 = sinpi_2_9 * x1; 128 s3 = sinpi_1_9 * x1; 129 s4 = sinpi_3_9 * x2; 130 s5 = sinpi_4_9 * x3; 131 s6 = sinpi_2_9 * x3; 132 s7 = x0 + x1 - x3; 133 134 x0 = s0 + s2 + s5; 135 x1 = sinpi_3_9 * s7; 136 x2 = s1 - s3 + s6; 137 x3 = s4; 138 139 s0 = x0 + x3; 140 s1 = x1; 141 s2 = x2 - x3; 142 s3 = x2 - x0 + x3; 143 144 // 1-D transform scaling factor is sqrt(2). 145 output[0] = fdct_round_shift(s0); 146 output[1] = fdct_round_shift(s1); 147 output[2] = fdct_round_shift(s2); 148 output[3] = fdct_round_shift(s3); 149} 150 151static const transform_2d FHT_4[] = { 152 { fdct4, fdct4 }, // DCT_DCT = 0 153 { fadst4, fdct4 }, // ADST_DCT = 1 154 { fdct4, fadst4 }, // DCT_ADST = 2 155 { fadst4, fadst4 } // ADST_ADST = 3 156}; 157 158void vp9_fht4x4_c(const int16_t *input, int16_t *output, 159 int stride, int tx_type) { 160 if (tx_type == DCT_DCT) { 161 vp9_fdct4x4_c(input, output, stride); 162 } else { 163 int16_t out[4 * 4]; 164 int16_t *outptr = &out[0]; 165 int i, j; 166 int16_t temp_in[4], temp_out[4]; 167 const transform_2d ht = FHT_4[tx_type]; 168 169 // Columns 170 for (i = 0; i < 4; ++i) { 171 for (j = 0; j < 4; ++j) 172 temp_in[j] = input[j * stride + i] * 16; 173 if (i == 0 && temp_in[0]) 174 temp_in[0] += 1; 175 ht.cols(temp_in, temp_out); 176 for (j = 0; j < 4; ++j) 177 outptr[j * 4 + i] = temp_out[j]; 178 } 179 180 // Rows 181 for (i = 0; i < 4; ++i) { 182 for (j = 0; j < 4; ++j) 183 temp_in[j] = out[j + i * 4]; 184 ht.rows(temp_in, temp_out); 185 for (j = 0; j < 4; ++j) 186 output[j + i * 4] = (temp_out[j] + 1) >> 2; 187 } 188 } 189} 190 191static void fdct8(const int16_t *input, int16_t *output) { 192 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 193 /*needs32*/ int t0, t1, t2, t3; 194 /*canbe16*/ int x0, x1, x2, x3; 195 196 // stage 1 197 s0 = input[0] + input[7]; 198 s1 = input[1] + input[6]; 199 s2 = input[2] + input[5]; 200 s3 = input[3] + input[4]; 201 s4 = input[3] - input[4]; 202 s5 = input[2] - input[5]; 203 s6 = input[1] - input[6]; 204 s7 = input[0] - input[7]; 205 206 // fdct4(step, step); 207 x0 = s0 + s3; 208 x1 = s1 + s2; 209 x2 = s1 - s2; 210 x3 = s0 - s3; 211 t0 = (x0 + x1) * cospi_16_64; 212 t1 = (x0 - x1) * cospi_16_64; 213 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 214 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 215 output[0] = fdct_round_shift(t0); 216 output[2] = fdct_round_shift(t2); 217 output[4] = fdct_round_shift(t1); 218 output[6] = fdct_round_shift(t3); 219 220 // Stage 2 221 t0 = (s6 - s5) * cospi_16_64; 222 t1 = (s6 + s5) * cospi_16_64; 223 t2 = fdct_round_shift(t0); 224 t3 = fdct_round_shift(t1); 225 226 // Stage 3 227 x0 = s4 + t2; 228 x1 = s4 - t2; 229 x2 = s7 - t3; 230 x3 = s7 + t3; 231 232 // Stage 4 233 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 234 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 235 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 236 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 237 output[1] = fdct_round_shift(t0); 238 output[3] = fdct_round_shift(t2); 239 output[5] = fdct_round_shift(t1); 240 output[7] = fdct_round_shift(t3); 241} 242 243void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { 244 int i, j; 245 int16_t intermediate[64]; 246 247 // Transform columns 248 { 249 int16_t *output = intermediate; 250 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 251 /*needs32*/ int t0, t1, t2, t3; 252 /*canbe16*/ int x0, x1, x2, x3; 253 254 int i; 255 for (i = 0; i < 8; i++) { 256 // stage 1 257 s0 = (input[0 * stride] + input[7 * stride]) * 4; 258 s1 = (input[1 * stride] + input[6 * stride]) * 4; 259 s2 = (input[2 * stride] + input[5 * stride]) * 4; 260 s3 = (input[3 * stride] + input[4 * stride]) * 4; 261 s4 = (input[3 * stride] - input[4 * stride]) * 4; 262 s5 = (input[2 * stride] - input[5 * stride]) * 4; 263 s6 = (input[1 * stride] - input[6 * stride]) * 4; 264 s7 = (input[0 * stride] - input[7 * stride]) * 4; 265 266 // fdct4(step, step); 267 x0 = s0 + s3; 268 x1 = s1 + s2; 269 x2 = s1 - s2; 270 x3 = s0 - s3; 271 t0 = (x0 + x1) * cospi_16_64; 272 t1 = (x0 - x1) * cospi_16_64; 273 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 274 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 275 output[0 * 8] = fdct_round_shift(t0); 276 output[2 * 8] = fdct_round_shift(t2); 277 output[4 * 8] = fdct_round_shift(t1); 278 output[6 * 8] = fdct_round_shift(t3); 279 280 // Stage 2 281 t0 = (s6 - s5) * cospi_16_64; 282 t1 = (s6 + s5) * cospi_16_64; 283 t2 = fdct_round_shift(t0); 284 t3 = fdct_round_shift(t1); 285 286 // Stage 3 287 x0 = s4 + t2; 288 x1 = s4 - t2; 289 x2 = s7 - t3; 290 x3 = s7 + t3; 291 292 // Stage 4 293 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 294 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 295 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 296 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 297 output[1 * 8] = fdct_round_shift(t0); 298 output[3 * 8] = fdct_round_shift(t2); 299 output[5 * 8] = fdct_round_shift(t1); 300 output[7 * 8] = fdct_round_shift(t3); 301 input++; 302 output++; 303 } 304 } 305 306 // Rows 307 for (i = 0; i < 8; ++i) { 308 fdct8(&intermediate[i * 8], &final_output[i * 8]); 309 for (j = 0; j < 8; ++j) 310 final_output[j + i * 8] /= 2; 311 } 312} 313 314void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { 315 // The 2D transform is done with two passes which are actually pretty 316 // similar. In the first one, we transform the columns and transpose 317 // the results. In the second one, we transform the rows. To achieve that, 318 // as the first pass results are transposed, we transpose the columns (that 319 // is the transposed rows) and transpose the results (so that it goes back 320 // in normal/row positions). 321 int pass; 322 // We need an intermediate buffer between passes. 323 int16_t intermediate[256]; 324 const int16_t *in = input; 325 int16_t *out = intermediate; 326 // Do the two transform/transpose passes 327 for (pass = 0; pass < 2; ++pass) { 328 /*canbe16*/ int step1[8]; 329 /*canbe16*/ int step2[8]; 330 /*canbe16*/ int step3[8]; 331 /*canbe16*/ int input[8]; 332 /*needs32*/ int temp1, temp2; 333 int i; 334 for (i = 0; i < 16; i++) { 335 if (0 == pass) { 336 // Calculate input for the first 8 results. 337 input[0] = (in[0 * stride] + in[15 * stride]) * 4; 338 input[1] = (in[1 * stride] + in[14 * stride]) * 4; 339 input[2] = (in[2 * stride] + in[13 * stride]) * 4; 340 input[3] = (in[3 * stride] + in[12 * stride]) * 4; 341 input[4] = (in[4 * stride] + in[11 * stride]) * 4; 342 input[5] = (in[5 * stride] + in[10 * stride]) * 4; 343 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; 344 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; 345 // Calculate input for the next 8 results. 346 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; 347 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; 348 step1[2] = (in[5 * stride] - in[10 * stride]) * 4; 349 step1[3] = (in[4 * stride] - in[11 * stride]) * 4; 350 step1[4] = (in[3 * stride] - in[12 * stride]) * 4; 351 step1[5] = (in[2 * stride] - in[13 * stride]) * 4; 352 step1[6] = (in[1 * stride] - in[14 * stride]) * 4; 353 step1[7] = (in[0 * stride] - in[15 * stride]) * 4; 354 } else { 355 // Calculate input for the first 8 results. 356 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); 357 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); 358 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); 359 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); 360 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); 361 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); 362 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); 363 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); 364 // Calculate input for the next 8 results. 365 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); 366 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); 367 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); 368 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); 369 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); 370 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); 371 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); 372 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); 373 } 374 // Work on the first eight values; fdct8(input, even_results); 375 { 376 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 377 /*needs32*/ int t0, t1, t2, t3; 378 /*canbe16*/ int x0, x1, x2, x3; 379 380 // stage 1 381 s0 = input[0] + input[7]; 382 s1 = input[1] + input[6]; 383 s2 = input[2] + input[5]; 384 s3 = input[3] + input[4]; 385 s4 = input[3] - input[4]; 386 s5 = input[2] - input[5]; 387 s6 = input[1] - input[6]; 388 s7 = input[0] - input[7]; 389 390 // fdct4(step, step); 391 x0 = s0 + s3; 392 x1 = s1 + s2; 393 x2 = s1 - s2; 394 x3 = s0 - s3; 395 t0 = (x0 + x1) * cospi_16_64; 396 t1 = (x0 - x1) * cospi_16_64; 397 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 398 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 399 out[0] = fdct_round_shift(t0); 400 out[4] = fdct_round_shift(t2); 401 out[8] = fdct_round_shift(t1); 402 out[12] = fdct_round_shift(t3); 403 404 // Stage 2 405 t0 = (s6 - s5) * cospi_16_64; 406 t1 = (s6 + s5) * cospi_16_64; 407 t2 = fdct_round_shift(t0); 408 t3 = fdct_round_shift(t1); 409 410 // Stage 3 411 x0 = s4 + t2; 412 x1 = s4 - t2; 413 x2 = s7 - t3; 414 x3 = s7 + t3; 415 416 // Stage 4 417 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 418 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 419 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 420 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 421 out[2] = fdct_round_shift(t0); 422 out[6] = fdct_round_shift(t2); 423 out[10] = fdct_round_shift(t1); 424 out[14] = fdct_round_shift(t3); 425 } 426 // Work on the next eight values; step1 -> odd_results 427 { 428 // step 2 429 temp1 = (step1[5] - step1[2]) * cospi_16_64; 430 temp2 = (step1[4] - step1[3]) * cospi_16_64; 431 step2[2] = fdct_round_shift(temp1); 432 step2[3] = fdct_round_shift(temp2); 433 temp1 = (step1[4] + step1[3]) * cospi_16_64; 434 temp2 = (step1[5] + step1[2]) * cospi_16_64; 435 step2[4] = fdct_round_shift(temp1); 436 step2[5] = fdct_round_shift(temp2); 437 // step 3 438 step3[0] = step1[0] + step2[3]; 439 step3[1] = step1[1] + step2[2]; 440 step3[2] = step1[1] - step2[2]; 441 step3[3] = step1[0] - step2[3]; 442 step3[4] = step1[7] - step2[4]; 443 step3[5] = step1[6] - step2[5]; 444 step3[6] = step1[6] + step2[5]; 445 step3[7] = step1[7] + step2[4]; 446 // step 4 447 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 448 temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; 449 step2[1] = fdct_round_shift(temp1); 450 step2[2] = fdct_round_shift(temp2); 451 temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; 452 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 453 step2[5] = fdct_round_shift(temp1); 454 step2[6] = fdct_round_shift(temp2); 455 // step 5 456 step1[0] = step3[0] + step2[1]; 457 step1[1] = step3[0] - step2[1]; 458 step1[2] = step3[3] - step2[2]; 459 step1[3] = step3[3] + step2[2]; 460 step1[4] = step3[4] + step2[5]; 461 step1[5] = step3[4] - step2[5]; 462 step1[6] = step3[7] - step2[6]; 463 step1[7] = step3[7] + step2[6]; 464 // step 6 465 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 466 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 467 out[1] = fdct_round_shift(temp1); 468 out[9] = fdct_round_shift(temp2); 469 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 470 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 471 out[5] = fdct_round_shift(temp1); 472 out[13] = fdct_round_shift(temp2); 473 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 474 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 475 out[3] = fdct_round_shift(temp1); 476 out[11] = fdct_round_shift(temp2); 477 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 478 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 479 out[7] = fdct_round_shift(temp1); 480 out[15] = fdct_round_shift(temp2); 481 } 482 // Do next column (which is a transposed row in second/horizontal pass) 483 in++; 484 out += 16; 485 } 486 // Setup in/out for next pass. 487 in = intermediate; 488 out = output; 489 } 490} 491 492static void fadst8(const int16_t *input, int16_t *output) { 493 int s0, s1, s2, s3, s4, s5, s6, s7; 494 495 int x0 = input[7]; 496 int x1 = input[0]; 497 int x2 = input[5]; 498 int x3 = input[2]; 499 int x4 = input[3]; 500 int x5 = input[4]; 501 int x6 = input[1]; 502 int x7 = input[6]; 503 504 // stage 1 505 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 506 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 507 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 508 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 509 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 510 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 511 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 512 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 513 514 x0 = fdct_round_shift(s0 + s4); 515 x1 = fdct_round_shift(s1 + s5); 516 x2 = fdct_round_shift(s2 + s6); 517 x3 = fdct_round_shift(s3 + s7); 518 x4 = fdct_round_shift(s0 - s4); 519 x5 = fdct_round_shift(s1 - s5); 520 x6 = fdct_round_shift(s2 - s6); 521 x7 = fdct_round_shift(s3 - s7); 522 523 // stage 2 524 s0 = x0; 525 s1 = x1; 526 s2 = x2; 527 s3 = x3; 528 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 529 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 530 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; 531 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 532 533 x0 = s0 + s2; 534 x1 = s1 + s3; 535 x2 = s0 - s2; 536 x3 = s1 - s3; 537 x4 = fdct_round_shift(s4 + s6); 538 x5 = fdct_round_shift(s5 + s7); 539 x6 = fdct_round_shift(s4 - s6); 540 x7 = fdct_round_shift(s5 - s7); 541 542 // stage 3 543 s2 = cospi_16_64 * (x2 + x3); 544 s3 = cospi_16_64 * (x2 - x3); 545 s6 = cospi_16_64 * (x6 + x7); 546 s7 = cospi_16_64 * (x6 - x7); 547 548 x2 = fdct_round_shift(s2); 549 x3 = fdct_round_shift(s3); 550 x6 = fdct_round_shift(s6); 551 x7 = fdct_round_shift(s7); 552 553 output[0] = x0; 554 output[1] = - x4; 555 output[2] = x6; 556 output[3] = - x2; 557 output[4] = x3; 558 output[5] = - x7; 559 output[6] = x5; 560 output[7] = - x1; 561} 562 563static const transform_2d FHT_8[] = { 564 { fdct8, fdct8 }, // DCT_DCT = 0 565 { fadst8, fdct8 }, // ADST_DCT = 1 566 { fdct8, fadst8 }, // DCT_ADST = 2 567 { fadst8, fadst8 } // ADST_ADST = 3 568}; 569 570void vp9_fht8x8_c(const int16_t *input, int16_t *output, 571 int stride, int tx_type) { 572 if (tx_type == DCT_DCT) { 573 vp9_fdct8x8_c(input, output, stride); 574 } else { 575 int16_t out[64]; 576 int16_t *outptr = &out[0]; 577 int i, j; 578 int16_t temp_in[8], temp_out[8]; 579 const transform_2d ht = FHT_8[tx_type]; 580 581 // Columns 582 for (i = 0; i < 8; ++i) { 583 for (j = 0; j < 8; ++j) 584 temp_in[j] = input[j * stride + i] * 4; 585 ht.cols(temp_in, temp_out); 586 for (j = 0; j < 8; ++j) 587 outptr[j * 8 + i] = temp_out[j]; 588 } 589 590 // Rows 591 for (i = 0; i < 8; ++i) { 592 for (j = 0; j < 8; ++j) 593 temp_in[j] = out[j + i * 8]; 594 ht.rows(temp_in, temp_out); 595 for (j = 0; j < 8; ++j) 596 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; 597 } 598 } 599} 600 601/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per 602 pixel. */ 603void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { 604 int i; 605 int a1, b1, c1, d1, e1; 606 const int16_t *ip = input; 607 int16_t *op = output; 608 609 for (i = 0; i < 4; i++) { 610 a1 = ip[0 * stride]; 611 b1 = ip[1 * stride]; 612 c1 = ip[2 * stride]; 613 d1 = ip[3 * stride]; 614 615 a1 += b1; 616 d1 = d1 - c1; 617 e1 = (a1 - d1) >> 1; 618 b1 = e1 - b1; 619 c1 = e1 - c1; 620 a1 -= c1; 621 d1 += b1; 622 op[0] = a1; 623 op[4] = c1; 624 op[8] = d1; 625 op[12] = b1; 626 627 ip++; 628 op++; 629 } 630 ip = output; 631 op = output; 632 633 for (i = 0; i < 4; i++) { 634 a1 = ip[0]; 635 b1 = ip[1]; 636 c1 = ip[2]; 637 d1 = ip[3]; 638 639 a1 += b1; 640 d1 -= c1; 641 e1 = (a1 - d1) >> 1; 642 b1 = e1 - b1; 643 c1 = e1 - c1; 644 a1 -= c1; 645 d1 += b1; 646 op[0] = a1 * UNIT_QUANT_FACTOR; 647 op[1] = c1 * UNIT_QUANT_FACTOR; 648 op[2] = d1 * UNIT_QUANT_FACTOR; 649 op[3] = b1 * UNIT_QUANT_FACTOR; 650 651 ip += 4; 652 op += 4; 653 } 654} 655 656// Rewrote to use same algorithm as others. 657static void fdct16(const int16_t in[16], int16_t out[16]) { 658 /*canbe16*/ int step1[8]; 659 /*canbe16*/ int step2[8]; 660 /*canbe16*/ int step3[8]; 661 /*canbe16*/ int input[8]; 662 /*needs32*/ int temp1, temp2; 663 664 // step 1 665 input[0] = in[0] + in[15]; 666 input[1] = in[1] + in[14]; 667 input[2] = in[2] + in[13]; 668 input[3] = in[3] + in[12]; 669 input[4] = in[4] + in[11]; 670 input[5] = in[5] + in[10]; 671 input[6] = in[6] + in[ 9]; 672 input[7] = in[7] + in[ 8]; 673 674 step1[0] = in[7] - in[ 8]; 675 step1[1] = in[6] - in[ 9]; 676 step1[2] = in[5] - in[10]; 677 step1[3] = in[4] - in[11]; 678 step1[4] = in[3] - in[12]; 679 step1[5] = in[2] - in[13]; 680 step1[6] = in[1] - in[14]; 681 step1[7] = in[0] - in[15]; 682 683 // fdct8(step, step); 684 { 685 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 686 /*needs32*/ int t0, t1, t2, t3; 687 /*canbe16*/ int x0, x1, x2, x3; 688 689 // stage 1 690 s0 = input[0] + input[7]; 691 s1 = input[1] + input[6]; 692 s2 = input[2] + input[5]; 693 s3 = input[3] + input[4]; 694 s4 = input[3] - input[4]; 695 s5 = input[2] - input[5]; 696 s6 = input[1] - input[6]; 697 s7 = input[0] - input[7]; 698 699 // fdct4(step, step); 700 x0 = s0 + s3; 701 x1 = s1 + s2; 702 x2 = s1 - s2; 703 x3 = s0 - s3; 704 t0 = (x0 + x1) * cospi_16_64; 705 t1 = (x0 - x1) * cospi_16_64; 706 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 707 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 708 out[0] = fdct_round_shift(t0); 709 out[4] = fdct_round_shift(t2); 710 out[8] = fdct_round_shift(t1); 711 out[12] = fdct_round_shift(t3); 712 713 // Stage 2 714 t0 = (s6 - s5) * cospi_16_64; 715 t1 = (s6 + s5) * cospi_16_64; 716 t2 = fdct_round_shift(t0); 717 t3 = fdct_round_shift(t1); 718 719 // Stage 3 720 x0 = s4 + t2; 721 x1 = s4 - t2; 722 x2 = s7 - t3; 723 x3 = s7 + t3; 724 725 // Stage 4 726 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 727 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 728 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 729 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 730 out[2] = fdct_round_shift(t0); 731 out[6] = fdct_round_shift(t2); 732 out[10] = fdct_round_shift(t1); 733 out[14] = fdct_round_shift(t3); 734 } 735 736 // step 2 737 temp1 = (step1[5] - step1[2]) * cospi_16_64; 738 temp2 = (step1[4] - step1[3]) * cospi_16_64; 739 step2[2] = fdct_round_shift(temp1); 740 step2[3] = fdct_round_shift(temp2); 741 temp1 = (step1[4] + step1[3]) * cospi_16_64; 742 temp2 = (step1[5] + step1[2]) * cospi_16_64; 743 step2[4] = fdct_round_shift(temp1); 744 step2[5] = fdct_round_shift(temp2); 745 746 // step 3 747 step3[0] = step1[0] + step2[3]; 748 step3[1] = step1[1] + step2[2]; 749 step3[2] = step1[1] - step2[2]; 750 step3[3] = step1[0] - step2[3]; 751 step3[4] = step1[7] - step2[4]; 752 step3[5] = step1[6] - step2[5]; 753 step3[6] = step1[6] + step2[5]; 754 step3[7] = step1[7] + step2[4]; 755 756 // step 4 757 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 758 temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; 759 step2[1] = fdct_round_shift(temp1); 760 step2[2] = fdct_round_shift(temp2); 761 temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; 762 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 763 step2[5] = fdct_round_shift(temp1); 764 step2[6] = fdct_round_shift(temp2); 765 766 // step 5 767 step1[0] = step3[0] + step2[1]; 768 step1[1] = step3[0] - step2[1]; 769 step1[2] = step3[3] - step2[2]; 770 step1[3] = step3[3] + step2[2]; 771 step1[4] = step3[4] + step2[5]; 772 step1[5] = step3[4] - step2[5]; 773 step1[6] = step3[7] - step2[6]; 774 step1[7] = step3[7] + step2[6]; 775 776 // step 6 777 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 778 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 779 out[1] = fdct_round_shift(temp1); 780 out[9] = fdct_round_shift(temp2); 781 782 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 783 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 784 out[5] = fdct_round_shift(temp1); 785 out[13] = fdct_round_shift(temp2); 786 787 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 788 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 789 out[3] = fdct_round_shift(temp1); 790 out[11] = fdct_round_shift(temp2); 791 792 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 793 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 794 out[7] = fdct_round_shift(temp1); 795 out[15] = fdct_round_shift(temp2); 796} 797 798static void fadst16(const int16_t *input, int16_t *output) { 799 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 800 801 int x0 = input[15]; 802 int x1 = input[0]; 803 int x2 = input[13]; 804 int x3 = input[2]; 805 int x4 = input[11]; 806 int x5 = input[4]; 807 int x6 = input[9]; 808 int x7 = input[6]; 809 int x8 = input[7]; 810 int x9 = input[8]; 811 int x10 = input[5]; 812 int x11 = input[10]; 813 int x12 = input[3]; 814 int x13 = input[12]; 815 int x14 = input[1]; 816 int x15 = input[14]; 817 818 // stage 1 819 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 820 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 821 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 822 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 823 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 824 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 825 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 826 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 827 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 828 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 829 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 830 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 831 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 832 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 833 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 834 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 835 836 x0 = fdct_round_shift(s0 + s8); 837 x1 = fdct_round_shift(s1 + s9); 838 x2 = fdct_round_shift(s2 + s10); 839 x3 = fdct_round_shift(s3 + s11); 840 x4 = fdct_round_shift(s4 + s12); 841 x5 = fdct_round_shift(s5 + s13); 842 x6 = fdct_round_shift(s6 + s14); 843 x7 = fdct_round_shift(s7 + s15); 844 x8 = fdct_round_shift(s0 - s8); 845 x9 = fdct_round_shift(s1 - s9); 846 x10 = fdct_round_shift(s2 - s10); 847 x11 = fdct_round_shift(s3 - s11); 848 x12 = fdct_round_shift(s4 - s12); 849 x13 = fdct_round_shift(s5 - s13); 850 x14 = fdct_round_shift(s6 - s14); 851 x15 = fdct_round_shift(s7 - s15); 852 853 // stage 2 854 s0 = x0; 855 s1 = x1; 856 s2 = x2; 857 s3 = x3; 858 s4 = x4; 859 s5 = x5; 860 s6 = x6; 861 s7 = x7; 862 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 863 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 864 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 865 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 866 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 867 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 868 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 869 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 870 871 x0 = s0 + s4; 872 x1 = s1 + s5; 873 x2 = s2 + s6; 874 x3 = s3 + s7; 875 x4 = s0 - s4; 876 x5 = s1 - s5; 877 x6 = s2 - s6; 878 x7 = s3 - s7; 879 x8 = fdct_round_shift(s8 + s12); 880 x9 = fdct_round_shift(s9 + s13); 881 x10 = fdct_round_shift(s10 + s14); 882 x11 = fdct_round_shift(s11 + s15); 883 x12 = fdct_round_shift(s8 - s12); 884 x13 = fdct_round_shift(s9 - s13); 885 x14 = fdct_round_shift(s10 - s14); 886 x15 = fdct_round_shift(s11 - s15); 887 888 // stage 3 889 s0 = x0; 890 s1 = x1; 891 s2 = x2; 892 s3 = x3; 893 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 894 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 895 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 896 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 897 s8 = x8; 898 s9 = x9; 899 s10 = x10; 900 s11 = x11; 901 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 902 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 903 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 904 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 905 906 x0 = s0 + s2; 907 x1 = s1 + s3; 908 x2 = s0 - s2; 909 x3 = s1 - s3; 910 x4 = fdct_round_shift(s4 + s6); 911 x5 = fdct_round_shift(s5 + s7); 912 x6 = fdct_round_shift(s4 - s6); 913 x7 = fdct_round_shift(s5 - s7); 914 x8 = s8 + s10; 915 x9 = s9 + s11; 916 x10 = s8 - s10; 917 x11 = s9 - s11; 918 x12 = fdct_round_shift(s12 + s14); 919 x13 = fdct_round_shift(s13 + s15); 920 x14 = fdct_round_shift(s12 - s14); 921 x15 = fdct_round_shift(s13 - s15); 922 923 // stage 4 924 s2 = (- cospi_16_64) * (x2 + x3); 925 s3 = cospi_16_64 * (x2 - x3); 926 s6 = cospi_16_64 * (x6 + x7); 927 s7 = cospi_16_64 * (- x6 + x7); 928 s10 = cospi_16_64 * (x10 + x11); 929 s11 = cospi_16_64 * (- x10 + x11); 930 s14 = (- cospi_16_64) * (x14 + x15); 931 s15 = cospi_16_64 * (x14 - x15); 932 933 x2 = fdct_round_shift(s2); 934 x3 = fdct_round_shift(s3); 935 x6 = fdct_round_shift(s6); 936 x7 = fdct_round_shift(s7); 937 x10 = fdct_round_shift(s10); 938 x11 = fdct_round_shift(s11); 939 x14 = fdct_round_shift(s14); 940 x15 = fdct_round_shift(s15); 941 942 output[0] = x0; 943 output[1] = - x8; 944 output[2] = x12; 945 output[3] = - x4; 946 output[4] = x6; 947 output[5] = x14; 948 output[6] = x10; 949 output[7] = x2; 950 output[8] = x3; 951 output[9] = x11; 952 output[10] = x15; 953 output[11] = x7; 954 output[12] = x5; 955 output[13] = - x13; 956 output[14] = x9; 957 output[15] = - x1; 958} 959 960static const transform_2d FHT_16[] = { 961 { fdct16, fdct16 }, // DCT_DCT = 0 962 { fadst16, fdct16 }, // ADST_DCT = 1 963 { fdct16, fadst16 }, // DCT_ADST = 2 964 { fadst16, fadst16 } // ADST_ADST = 3 965}; 966 967void vp9_fht16x16_c(const int16_t *input, int16_t *output, 968 int stride, int tx_type) { 969 if (tx_type == DCT_DCT) { 970 vp9_fdct16x16_c(input, output, stride); 971 } else { 972 int16_t out[256]; 973 int16_t *outptr = &out[0]; 974 int i, j; 975 int16_t temp_in[16], temp_out[16]; 976 const transform_2d ht = FHT_16[tx_type]; 977 978 // Columns 979 for (i = 0; i < 16; ++i) { 980 for (j = 0; j < 16; ++j) 981 temp_in[j] = input[j * stride + i] * 4; 982 ht.cols(temp_in, temp_out); 983 for (j = 0; j < 16; ++j) 984 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 985 } 986 987 // Rows 988 for (i = 0; i < 16; ++i) { 989 for (j = 0; j < 16; ++j) 990 temp_in[j] = out[j + i * 16]; 991 ht.rows(temp_in, temp_out); 992 for (j = 0; j < 16; ++j) 993 output[j + i * 16] = temp_out[j]; 994 } 995 } 996} 997 998static INLINE int dct_32_round(int input) { 999 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 1000 assert(-131072 <= rv && rv <= 131071); 1001 return rv; 1002} 1003 1004static INLINE int half_round_shift(int input) { 1005 int rv = (input + 1 + (input < 0)) >> 2; 1006 return rv; 1007} 1008 1009static void fdct32(const int *input, int *output, int round) { 1010 int step[32]; 1011 // Stage 1 1012 step[0] = input[0] + input[(32 - 1)]; 1013 step[1] = input[1] + input[(32 - 2)]; 1014 step[2] = input[2] + input[(32 - 3)]; 1015 step[3] = input[3] + input[(32 - 4)]; 1016 step[4] = input[4] + input[(32 - 5)]; 1017 step[5] = input[5] + input[(32 - 6)]; 1018 step[6] = input[6] + input[(32 - 7)]; 1019 step[7] = input[7] + input[(32 - 8)]; 1020 step[8] = input[8] + input[(32 - 9)]; 1021 step[9] = input[9] + input[(32 - 10)]; 1022 step[10] = input[10] + input[(32 - 11)]; 1023 step[11] = input[11] + input[(32 - 12)]; 1024 step[12] = input[12] + input[(32 - 13)]; 1025 step[13] = input[13] + input[(32 - 14)]; 1026 step[14] = input[14] + input[(32 - 15)]; 1027 step[15] = input[15] + input[(32 - 16)]; 1028 step[16] = -input[16] + input[(32 - 17)]; 1029 step[17] = -input[17] + input[(32 - 18)]; 1030 step[18] = -input[18] + input[(32 - 19)]; 1031 step[19] = -input[19] + input[(32 - 20)]; 1032 step[20] = -input[20] + input[(32 - 21)]; 1033 step[21] = -input[21] + input[(32 - 22)]; 1034 step[22] = -input[22] + input[(32 - 23)]; 1035 step[23] = -input[23] + input[(32 - 24)]; 1036 step[24] = -input[24] + input[(32 - 25)]; 1037 step[25] = -input[25] + input[(32 - 26)]; 1038 step[26] = -input[26] + input[(32 - 27)]; 1039 step[27] = -input[27] + input[(32 - 28)]; 1040 step[28] = -input[28] + input[(32 - 29)]; 1041 step[29] = -input[29] + input[(32 - 30)]; 1042 step[30] = -input[30] + input[(32 - 31)]; 1043 step[31] = -input[31] + input[(32 - 32)]; 1044 1045 // Stage 2 1046 output[0] = step[0] + step[16 - 1]; 1047 output[1] = step[1] + step[16 - 2]; 1048 output[2] = step[2] + step[16 - 3]; 1049 output[3] = step[3] + step[16 - 4]; 1050 output[4] = step[4] + step[16 - 5]; 1051 output[5] = step[5] + step[16 - 6]; 1052 output[6] = step[6] + step[16 - 7]; 1053 output[7] = step[7] + step[16 - 8]; 1054 output[8] = -step[8] + step[16 - 9]; 1055 output[9] = -step[9] + step[16 - 10]; 1056 output[10] = -step[10] + step[16 - 11]; 1057 output[11] = -step[11] + step[16 - 12]; 1058 output[12] = -step[12] + step[16 - 13]; 1059 output[13] = -step[13] + step[16 - 14]; 1060 output[14] = -step[14] + step[16 - 15]; 1061 output[15] = -step[15] + step[16 - 16]; 1062 1063 output[16] = step[16]; 1064 output[17] = step[17]; 1065 output[18] = step[18]; 1066 output[19] = step[19]; 1067 1068 output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); 1069 output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); 1070 output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); 1071 output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); 1072 1073 output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); 1074 output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); 1075 output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); 1076 output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); 1077 1078 output[28] = step[28]; 1079 output[29] = step[29]; 1080 output[30] = step[30]; 1081 output[31] = step[31]; 1082 1083 // dump the magnitude by 4, hence the intermediate values are within 1084 // the range of 16 bits. 1085 if (round) { 1086 output[0] = half_round_shift(output[0]); 1087 output[1] = half_round_shift(output[1]); 1088 output[2] = half_round_shift(output[2]); 1089 output[3] = half_round_shift(output[3]); 1090 output[4] = half_round_shift(output[4]); 1091 output[5] = half_round_shift(output[5]); 1092 output[6] = half_round_shift(output[6]); 1093 output[7] = half_round_shift(output[7]); 1094 output[8] = half_round_shift(output[8]); 1095 output[9] = half_round_shift(output[9]); 1096 output[10] = half_round_shift(output[10]); 1097 output[11] = half_round_shift(output[11]); 1098 output[12] = half_round_shift(output[12]); 1099 output[13] = half_round_shift(output[13]); 1100 output[14] = half_round_shift(output[14]); 1101 output[15] = half_round_shift(output[15]); 1102 1103 output[16] = half_round_shift(output[16]); 1104 output[17] = half_round_shift(output[17]); 1105 output[18] = half_round_shift(output[18]); 1106 output[19] = half_round_shift(output[19]); 1107 output[20] = half_round_shift(output[20]); 1108 output[21] = half_round_shift(output[21]); 1109 output[22] = half_round_shift(output[22]); 1110 output[23] = half_round_shift(output[23]); 1111 output[24] = half_round_shift(output[24]); 1112 output[25] = half_round_shift(output[25]); 1113 output[26] = half_round_shift(output[26]); 1114 output[27] = half_round_shift(output[27]); 1115 output[28] = half_round_shift(output[28]); 1116 output[29] = half_round_shift(output[29]); 1117 output[30] = half_round_shift(output[30]); 1118 output[31] = half_round_shift(output[31]); 1119 } 1120 1121 // Stage 3 1122 step[0] = output[0] + output[(8 - 1)]; 1123 step[1] = output[1] + output[(8 - 2)]; 1124 step[2] = output[2] + output[(8 - 3)]; 1125 step[3] = output[3] + output[(8 - 4)]; 1126 step[4] = -output[4] + output[(8 - 5)]; 1127 step[5] = -output[5] + output[(8 - 6)]; 1128 step[6] = -output[6] + output[(8 - 7)]; 1129 step[7] = -output[7] + output[(8 - 8)]; 1130 step[8] = output[8]; 1131 step[9] = output[9]; 1132 step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); 1133 step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); 1134 step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); 1135 step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); 1136 step[14] = output[14]; 1137 step[15] = output[15]; 1138 1139 step[16] = output[16] + output[23]; 1140 step[17] = output[17] + output[22]; 1141 step[18] = output[18] + output[21]; 1142 step[19] = output[19] + output[20]; 1143 step[20] = -output[20] + output[19]; 1144 step[21] = -output[21] + output[18]; 1145 step[22] = -output[22] + output[17]; 1146 step[23] = -output[23] + output[16]; 1147 step[24] = -output[24] + output[31]; 1148 step[25] = -output[25] + output[30]; 1149 step[26] = -output[26] + output[29]; 1150 step[27] = -output[27] + output[28]; 1151 step[28] = output[28] + output[27]; 1152 step[29] = output[29] + output[26]; 1153 step[30] = output[30] + output[25]; 1154 step[31] = output[31] + output[24]; 1155 1156 // Stage 4 1157 output[0] = step[0] + step[3]; 1158 output[1] = step[1] + step[2]; 1159 output[2] = -step[2] + step[1]; 1160 output[3] = -step[3] + step[0]; 1161 output[4] = step[4]; 1162 output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); 1163 output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); 1164 output[7] = step[7]; 1165 output[8] = step[8] + step[11]; 1166 output[9] = step[9] + step[10]; 1167 output[10] = -step[10] + step[9]; 1168 output[11] = -step[11] + step[8]; 1169 output[12] = -step[12] + step[15]; 1170 output[13] = -step[13] + step[14]; 1171 output[14] = step[14] + step[13]; 1172 output[15] = step[15] + step[12]; 1173 1174 output[16] = step[16]; 1175 output[17] = step[17]; 1176 output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); 1177 output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); 1178 output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); 1179 output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); 1180 output[22] = step[22]; 1181 output[23] = step[23]; 1182 output[24] = step[24]; 1183 output[25] = step[25]; 1184 output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); 1185 output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); 1186 output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); 1187 output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); 1188 output[30] = step[30]; 1189 output[31] = step[31]; 1190 1191 // Stage 5 1192 step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); 1193 step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); 1194 step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); 1195 step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); 1196 step[4] = output[4] + output[5]; 1197 step[5] = -output[5] + output[4]; 1198 step[6] = -output[6] + output[7]; 1199 step[7] = output[7] + output[6]; 1200 step[8] = output[8]; 1201 step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); 1202 step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); 1203 step[11] = output[11]; 1204 step[12] = output[12]; 1205 step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); 1206 step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); 1207 step[15] = output[15]; 1208 1209 step[16] = output[16] + output[19]; 1210 step[17] = output[17] + output[18]; 1211 step[18] = -output[18] + output[17]; 1212 step[19] = -output[19] + output[16]; 1213 step[20] = -output[20] + output[23]; 1214 step[21] = -output[21] + output[22]; 1215 step[22] = output[22] + output[21]; 1216 step[23] = output[23] + output[20]; 1217 step[24] = output[24] + output[27]; 1218 step[25] = output[25] + output[26]; 1219 step[26] = -output[26] + output[25]; 1220 step[27] = -output[27] + output[24]; 1221 step[28] = -output[28] + output[31]; 1222 step[29] = -output[29] + output[30]; 1223 step[30] = output[30] + output[29]; 1224 step[31] = output[31] + output[28]; 1225 1226 // Stage 6 1227 output[0] = step[0]; 1228 output[1] = step[1]; 1229 output[2] = step[2]; 1230 output[3] = step[3]; 1231 output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); 1232 output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); 1233 output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); 1234 output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); 1235 output[8] = step[8] + step[9]; 1236 output[9] = -step[9] + step[8]; 1237 output[10] = -step[10] + step[11]; 1238 output[11] = step[11] + step[10]; 1239 output[12] = step[12] + step[13]; 1240 output[13] = -step[13] + step[12]; 1241 output[14] = -step[14] + step[15]; 1242 output[15] = step[15] + step[14]; 1243 1244 output[16] = step[16]; 1245 output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); 1246 output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); 1247 output[19] = step[19]; 1248 output[20] = step[20]; 1249 output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); 1250 output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); 1251 output[23] = step[23]; 1252 output[24] = step[24]; 1253 output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); 1254 output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); 1255 output[27] = step[27]; 1256 output[28] = step[28]; 1257 output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); 1258 output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); 1259 output[31] = step[31]; 1260 1261 // Stage 7 1262 step[0] = output[0]; 1263 step[1] = output[1]; 1264 step[2] = output[2]; 1265 step[3] = output[3]; 1266 step[4] = output[4]; 1267 step[5] = output[5]; 1268 step[6] = output[6]; 1269 step[7] = output[7]; 1270 step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); 1271 step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); 1272 step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); 1273 step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); 1274 step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); 1275 step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); 1276 step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); 1277 step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); 1278 1279 step[16] = output[16] + output[17]; 1280 step[17] = -output[17] + output[16]; 1281 step[18] = -output[18] + output[19]; 1282 step[19] = output[19] + output[18]; 1283 step[20] = output[20] + output[21]; 1284 step[21] = -output[21] + output[20]; 1285 step[22] = -output[22] + output[23]; 1286 step[23] = output[23] + output[22]; 1287 step[24] = output[24] + output[25]; 1288 step[25] = -output[25] + output[24]; 1289 step[26] = -output[26] + output[27]; 1290 step[27] = output[27] + output[26]; 1291 step[28] = output[28] + output[29]; 1292 step[29] = -output[29] + output[28]; 1293 step[30] = -output[30] + output[31]; 1294 step[31] = output[31] + output[30]; 1295 1296 // Final stage --- outputs indices are bit-reversed. 1297 output[0] = step[0]; 1298 output[16] = step[1]; 1299 output[8] = step[2]; 1300 output[24] = step[3]; 1301 output[4] = step[4]; 1302 output[20] = step[5]; 1303 output[12] = step[6]; 1304 output[28] = step[7]; 1305 output[2] = step[8]; 1306 output[18] = step[9]; 1307 output[10] = step[10]; 1308 output[26] = step[11]; 1309 output[6] = step[12]; 1310 output[22] = step[13]; 1311 output[14] = step[14]; 1312 output[30] = step[15]; 1313 1314 output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); 1315 output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); 1316 output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); 1317 output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); 1318 output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); 1319 output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); 1320 output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); 1321 output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); 1322 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); 1323 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); 1324 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); 1325 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); 1326 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); 1327 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); 1328 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); 1329 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); 1330} 1331 1332void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { 1333 int i, j; 1334 int output[32 * 32]; 1335 1336 // Columns 1337 for (i = 0; i < 32; ++i) { 1338 int temp_in[32], temp_out[32]; 1339 for (j = 0; j < 32; ++j) 1340 temp_in[j] = input[j * stride + i] * 4; 1341 fdct32(temp_in, temp_out, 0); 1342 for (j = 0; j < 32; ++j) 1343 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1344 } 1345 1346 // Rows 1347 for (i = 0; i < 32; ++i) { 1348 int temp_in[32], temp_out[32]; 1349 for (j = 0; j < 32; ++j) 1350 temp_in[j] = output[j + i * 32]; 1351 fdct32(temp_in, temp_out, 0); 1352 for (j = 0; j < 32; ++j) 1353 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1354 } 1355} 1356 1357// Note that although we use dct_32_round in dct32 computation flow, 1358// this 2d fdct32x32 for rate-distortion optimization loop is operating 1359// within 16 bits precision. 1360void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { 1361 int i, j; 1362 int output[32 * 32]; 1363 1364 // Columns 1365 for (i = 0; i < 32; ++i) { 1366 int temp_in[32], temp_out[32]; 1367 for (j = 0; j < 32; ++j) 1368 temp_in[j] = input[j * stride + i] * 4; 1369 fdct32(temp_in, temp_out, 0); 1370 for (j = 0; j < 32; ++j) 1371 // TODO(cd): see quality impact of only doing 1372 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; 1373 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c 1374 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1375 } 1376 1377 // Rows 1378 for (i = 0; i < 32; ++i) { 1379 int temp_in[32], temp_out[32]; 1380 for (j = 0; j < 32; ++j) 1381 temp_in[j] = output[j + i * 32]; 1382 fdct32(temp_in, temp_out, 1); 1383 for (j = 0; j < 32; ++j) 1384 out[j + i * 32] = temp_out[j]; 1385 } 1386} 1387