1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12#include <assert.h> 13#include <math.h> 14#include "./vpx_config.h" 15#include "vp9/common/vp9_systemdependent.h" 16 17#include "vp9/common/vp9_blockd.h" 18#include "vp9/common/vp9_idct.h" 19 20static void fdct4_1d(int16_t *input, int16_t *output) { 21 int16_t step[4]; 22 int temp1, temp2; 23 24 step[0] = input[0] + input[3]; 25 step[1] = input[1] + input[2]; 26 step[2] = input[1] - input[2]; 27 step[3] = input[0] - input[3]; 28 29 temp1 = (step[0] + step[1]) * cospi_16_64; 30 temp2 = (step[0] - step[1]) * cospi_16_64; 31 output[0] = dct_const_round_shift(temp1); 32 output[2] = dct_const_round_shift(temp2); 33 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 34 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 35 output[1] = dct_const_round_shift(temp1); 36 output[3] = dct_const_round_shift(temp2); 37} 38 39void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { 40 // The 2D transform is done with two passes which are actually pretty 41 // similar. In the first one, we transform the columns and transpose 42 // the results. In the second one, we transform the rows. To achieve that, 43 // as the first pass results are transposed, we tranpose the columns (that 44 // is the transposed rows) and transpose the results (so that it goes back 45 // in normal/row positions). 46 const int stride = pitch >> 1; 47 int pass; 48 // We need an intermediate buffer between passes. 49 int16_t intermediate[4 * 4]; 50 int16_t *in = input; 51 int16_t *out = intermediate; 52 // Do the two transform/transpose passes 53 for (pass = 0; pass < 2; ++pass) { 54 /*canbe16*/ int input[4]; 55 /*canbe16*/ int step[4]; 56 /*needs32*/ int temp1, temp2; 57 int i; 58 for (i = 0; i < 4; ++i) { 59 // Load inputs. 60 if (0 == pass) { 61 input[0] = in[0 * stride] << 4; 62 input[1] = in[1 * stride] << 4; 63 input[2] = in[2 * stride] << 4; 64 input[3] = in[3 * stride] << 4; 65 if (i == 0 && input[0]) { 66 input[0] += 1; 67 } 68 } else { 69 input[0] = in[0 * 4]; 70 input[1] = in[1 * 4]; 71 input[2] = in[2 * 4]; 72 input[3] = in[3 * 4]; 73 } 74 // Transform. 75 step[0] = input[0] + input[3]; 76 step[1] = input[1] + input[2]; 77 step[2] = input[1] - input[2]; 78 step[3] = input[0] - input[3]; 79 temp1 = (step[0] + step[1]) * cospi_16_64; 80 temp2 = (step[0] - step[1]) * cospi_16_64; 81 out[0] = dct_const_round_shift(temp1); 82 out[2] = dct_const_round_shift(temp2); 83 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 84 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 85 out[1] = dct_const_round_shift(temp1); 86 out[3] = dct_const_round_shift(temp2); 87 // Do next column (which is a transposed row in second/horizontal pass) 88 in++; 89 out += 4; 90 } 91 // Setup in/out for next pass. 92 in = intermediate; 93 out = output; 94 } 95 96 { 97 int i, j; 98 for (i = 0; i < 4; ++i) { 99 for (j = 0; j < 4; ++j) 100 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; 101 } 102 } 103} 104 105static void fadst4_1d(int16_t *input, int16_t *output) { 106 int x0, x1, x2, x3; 107 int s0, s1, s2, s3, s4, s5, s6, s7; 108 109 x0 = input[0]; 110 x1 = input[1]; 111 x2 = input[2]; 112 x3 = input[3]; 113 114 if (!(x0 | x1 | x2 | x3)) { 115 output[0] = output[1] = output[2] = output[3] = 0; 116 return; 117 } 118 119 s0 = sinpi_1_9 * x0; 120 s1 = sinpi_4_9 * x0; 121 s2 = sinpi_2_9 * x1; 122 s3 = sinpi_1_9 * x1; 123 s4 = sinpi_3_9 * x2; 124 s5 = sinpi_4_9 * x3; 125 s6 = sinpi_2_9 * x3; 126 s7 = x0 + x1 - x3; 127 128 x0 = s0 + s2 + s5; 129 x1 = sinpi_3_9 * s7; 130 x2 = s1 - s3 + s6; 131 x3 = s4; 132 133 s0 = x0 + x3; 134 s1 = x1; 135 s2 = x2 - x3; 136 s3 = x2 - x0 + x3; 137 138 // 1-D transform scaling factor is sqrt(2). 139 output[0] = dct_const_round_shift(s0); 140 output[1] = dct_const_round_shift(s1); 141 output[2] = dct_const_round_shift(s2); 142 output[3] = dct_const_round_shift(s3); 143} 144 145static const transform_2d FHT_4[] = { 146 { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 147 { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 148 { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 149 { fadst4_1d, fadst4_1d } // ADST_ADST = 3 150}; 151 152void vp9_short_fht4x4_c(int16_t *input, int16_t *output, 153 int pitch, TX_TYPE tx_type) { 154 int16_t out[4 * 4]; 155 int16_t *outptr = &out[0]; 156 int i, j; 157 int16_t temp_in[4], temp_out[4]; 158 const transform_2d ht = FHT_4[tx_type]; 159 160 // Columns 161 for (i = 0; i < 4; ++i) { 162 for (j = 0; j < 4; ++j) 163 temp_in[j] = input[j * pitch + i] << 4; 164 if (i == 0 && temp_in[0]) 165 temp_in[0] += 1; 166 ht.cols(temp_in, temp_out); 167 for (j = 0; j < 4; ++j) 168 outptr[j * 4 + i] = temp_out[j]; 169 } 170 171 // Rows 172 for (i = 0; i < 4; ++i) { 173 for (j = 0; j < 4; ++j) 174 temp_in[j] = out[j + i * 4]; 175 ht.rows(temp_in, temp_out); 176 for (j = 0; j < 4; ++j) 177 output[j + i * 4] = (temp_out[j] + 1) >> 2; 178 } 179} 180 181void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { 182 vp9_short_fdct4x4_c(input, output, pitch); 183 vp9_short_fdct4x4_c(input + 4, output + 16, pitch); 184} 185 186static void fdct8_1d(int16_t *input, int16_t *output) { 187 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 188 /*needs32*/ int t0, t1, t2, t3; 189 /*canbe16*/ int x0, x1, x2, x3; 190 191 // stage 1 192 s0 = input[0] + input[7]; 193 s1 = input[1] + input[6]; 194 s2 = input[2] + input[5]; 195 s3 = input[3] + input[4]; 196 s4 = input[3] - input[4]; 197 s5 = input[2] - input[5]; 198 s6 = input[1] - input[6]; 199 s7 = input[0] - input[7]; 200 201 // fdct4_1d(step, step); 202 x0 = s0 + s3; 203 x1 = s1 + s2; 204 x2 = s1 - s2; 205 x3 = s0 - s3; 206 t0 = (x0 + x1) * cospi_16_64; 207 t1 = (x0 - x1) * cospi_16_64; 208 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 209 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 210 output[0] = dct_const_round_shift(t0); 211 output[2] = dct_const_round_shift(t2); 212 output[4] = dct_const_round_shift(t1); 213 output[6] = dct_const_round_shift(t3); 214 215 // Stage 2 216 t0 = (s6 - s5) * cospi_16_64; 217 t1 = (s6 + s5) * cospi_16_64; 218 t2 = dct_const_round_shift(t0); 219 t3 = dct_const_round_shift(t1); 220 221 // Stage 3 222 x0 = s4 + t2; 223 x1 = s4 - t2; 224 x2 = s7 - t3; 225 x3 = s7 + t3; 226 227 // Stage 4 228 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 229 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 230 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 231 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 232 output[1] = dct_const_round_shift(t0); 233 output[3] = dct_const_round_shift(t2); 234 output[5] = dct_const_round_shift(t1); 235 output[7] = dct_const_round_shift(t3); 236} 237 238void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { 239 const int stride = pitch >> 1; 240 int i, j; 241 int16_t intermediate[64]; 242 243 // Transform columns 244 { 245 int16_t *output = intermediate; 246 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 247 /*needs32*/ int t0, t1, t2, t3; 248 /*canbe16*/ int x0, x1, x2, x3; 249 250 int i; 251 for (i = 0; i < 8; i++) { 252 // stage 1 253 s0 = (input[0 * stride] + input[7 * stride]) << 2; 254 s1 = (input[1 * stride] + input[6 * stride]) << 2; 255 s2 = (input[2 * stride] + input[5 * stride]) << 2; 256 s3 = (input[3 * stride] + input[4 * stride]) << 2; 257 s4 = (input[3 * stride] - input[4 * stride]) << 2; 258 s5 = (input[2 * stride] - input[5 * stride]) << 2; 259 s6 = (input[1 * stride] - input[6 * stride]) << 2; 260 s7 = (input[0 * stride] - input[7 * stride]) << 2; 261 262 // fdct4_1d(step, step); 263 x0 = s0 + s3; 264 x1 = s1 + s2; 265 x2 = s1 - s2; 266 x3 = s0 - s3; 267 t0 = (x0 + x1) * cospi_16_64; 268 t1 = (x0 - x1) * cospi_16_64; 269 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 270 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 271 output[0 * 8] = dct_const_round_shift(t0); 272 output[2 * 8] = dct_const_round_shift(t2); 273 output[4 * 8] = dct_const_round_shift(t1); 274 output[6 * 8] = dct_const_round_shift(t3); 275 276 // Stage 2 277 t0 = (s6 - s5) * cospi_16_64; 278 t1 = (s6 + s5) * cospi_16_64; 279 t2 = dct_const_round_shift(t0); 280 t3 = dct_const_round_shift(t1); 281 282 // Stage 3 283 x0 = s4 + t2; 284 x1 = s4 - t2; 285 x2 = s7 - t3; 286 x3 = s7 + t3; 287 288 // Stage 4 289 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 290 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 291 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 292 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 293 output[1 * 8] = dct_const_round_shift(t0); 294 output[3 * 8] = dct_const_round_shift(t2); 295 output[5 * 8] = dct_const_round_shift(t1); 296 output[7 * 8] = dct_const_round_shift(t3); 297 input++; 298 output++; 299 } 300 } 301 302 // Rows 303 for (i = 0; i < 8; ++i) { 304 fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); 305 for (j = 0; j < 8; ++j) 306 final_output[j + i * 8] /= 2; 307 } 308} 309 310void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { 311 // The 2D transform is done with two passes which are actually pretty 312 // similar. In the first one, we transform the columns and transpose 313 // the results. In the second one, we transform the rows. To achieve that, 314 // as the first pass results are transposed, we tranpose the columns (that 315 // is the transposed rows) and transpose the results (so that it goes back 316 // in normal/row positions). 317 const int stride = pitch >> 1; 318 int pass; 319 // We need an intermediate buffer between passes. 320 int16_t intermediate[256]; 321 int16_t *in = input; 322 int16_t *out = intermediate; 323 // Do the two transform/transpose passes 324 for (pass = 0; pass < 2; ++pass) { 325 /*canbe16*/ int step1[8]; 326 /*canbe16*/ int step2[8]; 327 /*canbe16*/ int step3[8]; 328 /*canbe16*/ int input[8]; 329 /*needs32*/ int temp1, temp2; 330 int i; 331 for (i = 0; i < 16; i++) { 332 if (0 == pass) { 333 // Calculate input for the first 8 results. 334 input[0] = (in[0 * stride] + in[15 * stride]) << 2; 335 input[1] = (in[1 * stride] + in[14 * stride]) << 2; 336 input[2] = (in[2 * stride] + in[13 * stride]) << 2; 337 input[3] = (in[3 * stride] + in[12 * stride]) << 2; 338 input[4] = (in[4 * stride] + in[11 * stride]) << 2; 339 input[5] = (in[5 * stride] + in[10 * stride]) << 2; 340 input[6] = (in[6 * stride] + in[ 9 * stride]) << 2; 341 input[7] = (in[7 * stride] + in[ 8 * stride]) << 2; 342 // Calculate input for the next 8 results. 343 step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2; 344 step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2; 345 step1[2] = (in[5 * stride] - in[10 * stride]) << 2; 346 step1[3] = (in[4 * stride] - in[11 * stride]) << 2; 347 step1[4] = (in[3 * stride] - in[12 * stride]) << 2; 348 step1[5] = (in[2 * stride] - in[13 * stride]) << 2; 349 step1[6] = (in[1 * stride] - in[14 * stride]) << 2; 350 step1[7] = (in[0 * stride] - in[15 * stride]) << 2; 351 } else { 352 // Calculate input for the first 8 results. 353 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); 354 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); 355 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); 356 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); 357 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); 358 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); 359 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); 360 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); 361 // Calculate input for the next 8 results. 362 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); 363 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); 364 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); 365 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); 366 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); 367 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); 368 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); 369 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); 370 } 371 // Work on the first eight values; fdct8_1d(input, even_results); 372 { 373 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 374 /*needs32*/ int t0, t1, t2, t3; 375 /*canbe16*/ int x0, x1, x2, x3; 376 377 // stage 1 378 s0 = input[0] + input[7]; 379 s1 = input[1] + input[6]; 380 s2 = input[2] + input[5]; 381 s3 = input[3] + input[4]; 382 s4 = input[3] - input[4]; 383 s5 = input[2] - input[5]; 384 s6 = input[1] - input[6]; 385 s7 = input[0] - input[7]; 386 387 // fdct4_1d(step, step); 388 x0 = s0 + s3; 389 x1 = s1 + s2; 390 x2 = s1 - s2; 391 x3 = s0 - s3; 392 t0 = (x0 + x1) * cospi_16_64; 393 t1 = (x0 - x1) * cospi_16_64; 394 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 395 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 396 out[0] = dct_const_round_shift(t0); 397 out[4] = dct_const_round_shift(t2); 398 out[8] = dct_const_round_shift(t1); 399 out[12] = dct_const_round_shift(t3); 400 401 // Stage 2 402 t0 = (s6 - s5) * cospi_16_64; 403 t1 = (s6 + s5) * cospi_16_64; 404 t2 = dct_const_round_shift(t0); 405 t3 = dct_const_round_shift(t1); 406 407 // Stage 3 408 x0 = s4 + t2; 409 x1 = s4 - t2; 410 x2 = s7 - t3; 411 x3 = s7 + t3; 412 413 // Stage 4 414 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 415 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 416 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 417 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 418 out[2] = dct_const_round_shift(t0); 419 out[6] = dct_const_round_shift(t2); 420 out[10] = dct_const_round_shift(t1); 421 out[14] = dct_const_round_shift(t3); 422 } 423 // Work on the next eight values; step1 -> odd_results 424 { 425 // step 2 426 temp1 = (step1[5] - step1[2]) * cospi_16_64; 427 temp2 = (step1[4] - step1[3]) * cospi_16_64; 428 step2[2] = dct_const_round_shift(temp1); 429 step2[3] = dct_const_round_shift(temp2); 430 temp1 = (step1[4] + step1[3]) * cospi_16_64; 431 temp2 = (step1[5] + step1[2]) * cospi_16_64; 432 step2[4] = dct_const_round_shift(temp1); 433 step2[5] = dct_const_round_shift(temp2); 434 // step 3 435 step3[0] = step1[0] + step2[3]; 436 step3[1] = step1[1] + step2[2]; 437 step3[2] = step1[1] - step2[2]; 438 step3[3] = step1[0] - step2[3]; 439 step3[4] = step1[7] - step2[4]; 440 step3[5] = step1[6] - step2[5]; 441 step3[6] = step1[6] + step2[5]; 442 step3[7] = step1[7] + step2[4]; 443 // step 4 444 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 445 temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; 446 step2[1] = dct_const_round_shift(temp1); 447 step2[2] = dct_const_round_shift(temp2); 448 temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; 449 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 450 step2[5] = dct_const_round_shift(temp1); 451 step2[6] = dct_const_round_shift(temp2); 452 // step 5 453 step1[0] = step3[0] + step2[1]; 454 step1[1] = step3[0] - step2[1]; 455 step1[2] = step3[3] - step2[2]; 456 step1[3] = step3[3] + step2[2]; 457 step1[4] = step3[4] + step2[5]; 458 step1[5] = step3[4] - step2[5]; 459 step1[6] = step3[7] - step2[6]; 460 step1[7] = step3[7] + step2[6]; 461 // step 6 462 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 463 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 464 out[1] = dct_const_round_shift(temp1); 465 out[9] = dct_const_round_shift(temp2); 466 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 467 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 468 out[5] = dct_const_round_shift(temp1); 469 out[13] = dct_const_round_shift(temp2); 470 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 471 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 472 out[3] = dct_const_round_shift(temp1); 473 out[11] = dct_const_round_shift(temp2); 474 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 475 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 476 out[7] = dct_const_round_shift(temp1); 477 out[15] = dct_const_round_shift(temp2); 478 } 479 // Do next column (which is a transposed row in second/horizontal pass) 480 in++; 481 out += 16; 482 } 483 // Setup in/out for next pass. 484 in = intermediate; 485 out = output; 486 } 487} 488 489static void fadst8_1d(int16_t *input, int16_t *output) { 490 int s0, s1, s2, s3, s4, s5, s6, s7; 491 492 int x0 = input[7]; 493 int x1 = input[0]; 494 int x2 = input[5]; 495 int x3 = input[2]; 496 int x4 = input[3]; 497 int x5 = input[4]; 498 int x6 = input[1]; 499 int x7 = input[6]; 500 501 // stage 1 502 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 503 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 504 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 505 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 506 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 507 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 508 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 509 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 510 511 x0 = dct_const_round_shift(s0 + s4); 512 x1 = dct_const_round_shift(s1 + s5); 513 x2 = dct_const_round_shift(s2 + s6); 514 x3 = dct_const_round_shift(s3 + s7); 515 x4 = dct_const_round_shift(s0 - s4); 516 x5 = dct_const_round_shift(s1 - s5); 517 x6 = dct_const_round_shift(s2 - s6); 518 x7 = dct_const_round_shift(s3 - s7); 519 520 // stage 2 521 s0 = x0; 522 s1 = x1; 523 s2 = x2; 524 s3 = x3; 525 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 526 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 527 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; 528 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 529 530 x0 = s0 + s2; 531 x1 = s1 + s3; 532 x2 = s0 - s2; 533 x3 = s1 - s3; 534 x4 = dct_const_round_shift(s4 + s6); 535 x5 = dct_const_round_shift(s5 + s7); 536 x6 = dct_const_round_shift(s4 - s6); 537 x7 = dct_const_round_shift(s5 - s7); 538 539 // stage 3 540 s2 = cospi_16_64 * (x2 + x3); 541 s3 = cospi_16_64 * (x2 - x3); 542 s6 = cospi_16_64 * (x6 + x7); 543 s7 = cospi_16_64 * (x6 - x7); 544 545 x2 = dct_const_round_shift(s2); 546 x3 = dct_const_round_shift(s3); 547 x6 = dct_const_round_shift(s6); 548 x7 = dct_const_round_shift(s7); 549 550 output[0] = x0; 551 output[1] = - x4; 552 output[2] = x6; 553 output[3] = - x2; 554 output[4] = x3; 555 output[5] = - x7; 556 output[6] = x5; 557 output[7] = - x1; 558} 559 560static const transform_2d FHT_8[] = { 561 { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 562 { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 563 { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 564 { fadst8_1d, fadst8_1d } // ADST_ADST = 3 565}; 566 567void vp9_short_fht8x8_c(int16_t *input, int16_t *output, 568 int pitch, TX_TYPE tx_type) { 569 int16_t out[64]; 570 int16_t *outptr = &out[0]; 571 int i, j; 572 int16_t temp_in[8], temp_out[8]; 573 const transform_2d ht = FHT_8[tx_type]; 574 575 // Columns 576 for (i = 0; i < 8; ++i) { 577 for (j = 0; j < 8; ++j) 578 temp_in[j] = input[j * pitch + i] << 2; 579 ht.cols(temp_in, temp_out); 580 for (j = 0; j < 8; ++j) 581 outptr[j * 8 + i] = temp_out[j]; 582 } 583 584 // Rows 585 for (i = 0; i < 8; ++i) { 586 for (j = 0; j < 8; ++j) 587 temp_in[j] = out[j + i * 8]; 588 ht.rows(temp_in, temp_out); 589 for (j = 0; j < 8; ++j) 590 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; 591 } 592} 593 594/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per 595 pixel. */ 596void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { 597 int i; 598 int a1, b1, c1, d1, e1; 599 short *ip = input; 600 short *op = output; 601 int pitch_short = pitch >> 1; 602 603 for (i = 0; i < 4; i++) { 604 a1 = ip[0 * pitch_short]; 605 b1 = ip[1 * pitch_short]; 606 c1 = ip[2 * pitch_short]; 607 d1 = ip[3 * pitch_short]; 608 609 a1 += b1; 610 d1 = d1 - c1; 611 e1 = (a1 - d1) >> 1; 612 b1 = e1 - b1; 613 c1 = e1 - c1; 614 a1 -= c1; 615 d1 += b1; 616 op[0] = a1; 617 op[4] = c1; 618 op[8] = d1; 619 op[12] = b1; 620 621 ip++; 622 op++; 623 } 624 ip = output; 625 op = output; 626 627 for (i = 0; i < 4; i++) { 628 a1 = ip[0]; 629 b1 = ip[1]; 630 c1 = ip[2]; 631 d1 = ip[3]; 632 633 a1 += b1; 634 d1 -= c1; 635 e1 = (a1 - d1) >> 1; 636 b1 = e1 - b1; 637 c1 = e1 - c1; 638 a1 -= c1; 639 d1 += b1; 640 op[0] = a1 << WHT_UPSCALE_FACTOR; 641 op[1] = c1 << WHT_UPSCALE_FACTOR; 642 op[2] = d1 << WHT_UPSCALE_FACTOR; 643 op[3] = b1 << WHT_UPSCALE_FACTOR; 644 645 ip += 4; 646 op += 4; 647 } 648} 649 650void vp9_short_walsh8x4_c(short *input, short *output, int pitch) { 651 vp9_short_walsh4x4_c(input, output, pitch); 652 vp9_short_walsh4x4_c(input + 4, output + 16, pitch); 653} 654 655 656// Rewrote to use same algorithm as others. 657static void fdct16_1d(int16_t in[16], int16_t out[16]) { 658 /*canbe16*/ int step1[8]; 659 /*canbe16*/ int step2[8]; 660 /*canbe16*/ int step3[8]; 661 /*canbe16*/ int input[8]; 662 /*needs32*/ int temp1, temp2; 663 664 // step 1 665 input[0] = in[0] + in[15]; 666 input[1] = in[1] + in[14]; 667 input[2] = in[2] + in[13]; 668 input[3] = in[3] + in[12]; 669 input[4] = in[4] + in[11]; 670 input[5] = in[5] + in[10]; 671 input[6] = in[6] + in[ 9]; 672 input[7] = in[7] + in[ 8]; 673 674 step1[0] = in[7] - in[ 8]; 675 step1[1] = in[6] - in[ 9]; 676 step1[2] = in[5] - in[10]; 677 step1[3] = in[4] - in[11]; 678 step1[4] = in[3] - in[12]; 679 step1[5] = in[2] - in[13]; 680 step1[6] = in[1] - in[14]; 681 step1[7] = in[0] - in[15]; 682 683 // fdct8_1d(step, step); 684 { 685 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 686 /*needs32*/ int t0, t1, t2, t3; 687 /*canbe16*/ int x0, x1, x2, x3; 688 689 // stage 1 690 s0 = input[0] + input[7]; 691 s1 = input[1] + input[6]; 692 s2 = input[2] + input[5]; 693 s3 = input[3] + input[4]; 694 s4 = input[3] - input[4]; 695 s5 = input[2] - input[5]; 696 s6 = input[1] - input[6]; 697 s7 = input[0] - input[7]; 698 699 // fdct4_1d(step, step); 700 x0 = s0 + s3; 701 x1 = s1 + s2; 702 x2 = s1 - s2; 703 x3 = s0 - s3; 704 t0 = (x0 + x1) * cospi_16_64; 705 t1 = (x0 - x1) * cospi_16_64; 706 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 707 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 708 out[0] = dct_const_round_shift(t0); 709 out[4] = dct_const_round_shift(t2); 710 out[8] = dct_const_round_shift(t1); 711 out[12] = dct_const_round_shift(t3); 712 713 // Stage 2 714 t0 = (s6 - s5) * cospi_16_64; 715 t1 = (s6 + s5) * cospi_16_64; 716 t2 = dct_const_round_shift(t0); 717 t3 = dct_const_round_shift(t1); 718 719 // Stage 3 720 x0 = s4 + t2; 721 x1 = s4 - t2; 722 x2 = s7 - t3; 723 x3 = s7 + t3; 724 725 // Stage 4 726 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 727 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 728 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 729 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 730 out[2] = dct_const_round_shift(t0); 731 out[6] = dct_const_round_shift(t2); 732 out[10] = dct_const_round_shift(t1); 733 out[14] = dct_const_round_shift(t3); 734 } 735 736 // step 2 737 temp1 = (step1[5] - step1[2]) * cospi_16_64; 738 temp2 = (step1[4] - step1[3]) * cospi_16_64; 739 step2[2] = dct_const_round_shift(temp1); 740 step2[3] = dct_const_round_shift(temp2); 741 temp1 = (step1[4] + step1[3]) * cospi_16_64; 742 temp2 = (step1[5] + step1[2]) * cospi_16_64; 743 step2[4] = dct_const_round_shift(temp1); 744 step2[5] = dct_const_round_shift(temp2); 745 746 // step 3 747 step3[0] = step1[0] + step2[3]; 748 step3[1] = step1[1] + step2[2]; 749 step3[2] = step1[1] - step2[2]; 750 step3[3] = step1[0] - step2[3]; 751 step3[4] = step1[7] - step2[4]; 752 step3[5] = step1[6] - step2[5]; 753 step3[6] = step1[6] + step2[5]; 754 step3[7] = step1[7] + step2[4]; 755 756 // step 4 757 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 758 temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; 759 step2[1] = dct_const_round_shift(temp1); 760 step2[2] = dct_const_round_shift(temp2); 761 temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; 762 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 763 step2[5] = dct_const_round_shift(temp1); 764 step2[6] = dct_const_round_shift(temp2); 765 766 // step 5 767 step1[0] = step3[0] + step2[1]; 768 step1[1] = step3[0] - step2[1]; 769 step1[2] = step3[3] - step2[2]; 770 step1[3] = step3[3] + step2[2]; 771 step1[4] = step3[4] + step2[5]; 772 step1[5] = step3[4] - step2[5]; 773 step1[6] = step3[7] - step2[6]; 774 step1[7] = step3[7] + step2[6]; 775 776 // step 6 777 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 778 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 779 out[1] = dct_const_round_shift(temp1); 780 out[9] = dct_const_round_shift(temp2); 781 782 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 783 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 784 out[5] = dct_const_round_shift(temp1); 785 out[13] = dct_const_round_shift(temp2); 786 787 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 788 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 789 out[3] = dct_const_round_shift(temp1); 790 out[11] = dct_const_round_shift(temp2); 791 792 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 793 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 794 out[7] = dct_const_round_shift(temp1); 795 out[15] = dct_const_round_shift(temp2); 796} 797 798void fadst16_1d(int16_t *input, int16_t *output) { 799 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 800 801 int x0 = input[15]; 802 int x1 = input[0]; 803 int x2 = input[13]; 804 int x3 = input[2]; 805 int x4 = input[11]; 806 int x5 = input[4]; 807 int x6 = input[9]; 808 int x7 = input[6]; 809 int x8 = input[7]; 810 int x9 = input[8]; 811 int x10 = input[5]; 812 int x11 = input[10]; 813 int x12 = input[3]; 814 int x13 = input[12]; 815 int x14 = input[1]; 816 int x15 = input[14]; 817 818 // stage 1 819 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 820 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 821 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 822 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 823 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 824 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 825 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 826 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 827 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 828 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 829 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 830 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 831 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 832 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 833 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 834 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 835 836 x0 = dct_const_round_shift(s0 + s8); 837 x1 = dct_const_round_shift(s1 + s9); 838 x2 = dct_const_round_shift(s2 + s10); 839 x3 = dct_const_round_shift(s3 + s11); 840 x4 = dct_const_round_shift(s4 + s12); 841 x5 = dct_const_round_shift(s5 + s13); 842 x6 = dct_const_round_shift(s6 + s14); 843 x7 = dct_const_round_shift(s7 + s15); 844 x8 = dct_const_round_shift(s0 - s8); 845 x9 = dct_const_round_shift(s1 - s9); 846 x10 = dct_const_round_shift(s2 - s10); 847 x11 = dct_const_round_shift(s3 - s11); 848 x12 = dct_const_round_shift(s4 - s12); 849 x13 = dct_const_round_shift(s5 - s13); 850 x14 = dct_const_round_shift(s6 - s14); 851 x15 = dct_const_round_shift(s7 - s15); 852 853 // stage 2 854 s0 = x0; 855 s1 = x1; 856 s2 = x2; 857 s3 = x3; 858 s4 = x4; 859 s5 = x5; 860 s6 = x6; 861 s7 = x7; 862 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 863 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 864 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 865 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 866 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 867 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 868 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 869 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 870 871 x0 = s0 + s4; 872 x1 = s1 + s5; 873 x2 = s2 + s6; 874 x3 = s3 + s7; 875 x4 = s0 - s4; 876 x5 = s1 - s5; 877 x6 = s2 - s6; 878 x7 = s3 - s7; 879 x8 = dct_const_round_shift(s8 + s12); 880 x9 = dct_const_round_shift(s9 + s13); 881 x10 = dct_const_round_shift(s10 + s14); 882 x11 = dct_const_round_shift(s11 + s15); 883 x12 = dct_const_round_shift(s8 - s12); 884 x13 = dct_const_round_shift(s9 - s13); 885 x14 = dct_const_round_shift(s10 - s14); 886 x15 = dct_const_round_shift(s11 - s15); 887 888 // stage 3 889 s0 = x0; 890 s1 = x1; 891 s2 = x2; 892 s3 = x3; 893 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 894 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 895 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 896 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 897 s8 = x8; 898 s9 = x9; 899 s10 = x10; 900 s11 = x11; 901 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 902 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 903 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 904 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 905 906 x0 = s0 + s2; 907 x1 = s1 + s3; 908 x2 = s0 - s2; 909 x3 = s1 - s3; 910 x4 = dct_const_round_shift(s4 + s6); 911 x5 = dct_const_round_shift(s5 + s7); 912 x6 = dct_const_round_shift(s4 - s6); 913 x7 = dct_const_round_shift(s5 - s7); 914 x8 = s8 + s10; 915 x9 = s9 + s11; 916 x10 = s8 - s10; 917 x11 = s9 - s11; 918 x12 = dct_const_round_shift(s12 + s14); 919 x13 = dct_const_round_shift(s13 + s15); 920 x14 = dct_const_round_shift(s12 - s14); 921 x15 = dct_const_round_shift(s13 - s15); 922 923 // stage 4 924 s2 = (- cospi_16_64) * (x2 + x3); 925 s3 = cospi_16_64 * (x2 - x3); 926 s6 = cospi_16_64 * (x6 + x7); 927 s7 = cospi_16_64 * (- x6 + x7); 928 s10 = cospi_16_64 * (x10 + x11); 929 s11 = cospi_16_64 * (- x10 + x11); 930 s14 = (- cospi_16_64) * (x14 + x15); 931 s15 = cospi_16_64 * (x14 - x15); 932 933 x2 = dct_const_round_shift(s2); 934 x3 = dct_const_round_shift(s3); 935 x6 = dct_const_round_shift(s6); 936 x7 = dct_const_round_shift(s7); 937 x10 = dct_const_round_shift(s10); 938 x11 = dct_const_round_shift(s11); 939 x14 = dct_const_round_shift(s14); 940 x15 = dct_const_round_shift(s15); 941 942 output[0] = x0; 943 output[1] = - x8; 944 output[2] = x12; 945 output[3] = - x4; 946 output[4] = x6; 947 output[5] = x14; 948 output[6] = x10; 949 output[7] = x2; 950 output[8] = x3; 951 output[9] = x11; 952 output[10] = x15; 953 output[11] = x7; 954 output[12] = x5; 955 output[13] = - x13; 956 output[14] = x9; 957 output[15] = - x1; 958} 959 960static const transform_2d FHT_16[] = { 961 { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 962 { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 963 { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 964 { fadst16_1d, fadst16_1d } // ADST_ADST = 3 965}; 966 967void vp9_short_fht16x16_c(int16_t *input, int16_t *output, 968 int pitch, TX_TYPE tx_type) { 969 int16_t out[256]; 970 int16_t *outptr = &out[0]; 971 int i, j; 972 int16_t temp_in[16], temp_out[16]; 973 const transform_2d ht = FHT_16[tx_type]; 974 975 // Columns 976 for (i = 0; i < 16; ++i) { 977 for (j = 0; j < 16; ++j) 978 temp_in[j] = input[j * pitch + i] << 2; 979 ht.cols(temp_in, temp_out); 980 for (j = 0; j < 16; ++j) 981 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 982// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 983 } 984 985 // Rows 986 for (i = 0; i < 16; ++i) { 987 for (j = 0; j < 16; ++j) 988 temp_in[j] = out[j + i * 16]; 989 ht.rows(temp_in, temp_out); 990 for (j = 0; j < 16; ++j) 991 output[j + i * 16] = temp_out[j]; 992 } 993} 994 995static INLINE int dct_32_round(int input) { 996 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 997 assert(-131072 <= rv && rv <= 131071); 998 return rv; 999} 1000 1001static INLINE int half_round_shift(int input) { 1002 int rv = (input + 1 + (input < 0)) >> 2; 1003 return rv; 1004} 1005 1006static void dct32_1d(int *input, int *output, int round) { 1007 int step[32]; 1008 // Stage 1 1009 step[0] = input[0] + input[(32 - 1)]; 1010 step[1] = input[1] + input[(32 - 2)]; 1011 step[2] = input[2] + input[(32 - 3)]; 1012 step[3] = input[3] + input[(32 - 4)]; 1013 step[4] = input[4] + input[(32 - 5)]; 1014 step[5] = input[5] + input[(32 - 6)]; 1015 step[6] = input[6] + input[(32 - 7)]; 1016 step[7] = input[7] + input[(32 - 8)]; 1017 step[8] = input[8] + input[(32 - 9)]; 1018 step[9] = input[9] + input[(32 - 10)]; 1019 step[10] = input[10] + input[(32 - 11)]; 1020 step[11] = input[11] + input[(32 - 12)]; 1021 step[12] = input[12] + input[(32 - 13)]; 1022 step[13] = input[13] + input[(32 - 14)]; 1023 step[14] = input[14] + input[(32 - 15)]; 1024 step[15] = input[15] + input[(32 - 16)]; 1025 step[16] = -input[16] + input[(32 - 17)]; 1026 step[17] = -input[17] + input[(32 - 18)]; 1027 step[18] = -input[18] + input[(32 - 19)]; 1028 step[19] = -input[19] + input[(32 - 20)]; 1029 step[20] = -input[20] + input[(32 - 21)]; 1030 step[21] = -input[21] + input[(32 - 22)]; 1031 step[22] = -input[22] + input[(32 - 23)]; 1032 step[23] = -input[23] + input[(32 - 24)]; 1033 step[24] = -input[24] + input[(32 - 25)]; 1034 step[25] = -input[25] + input[(32 - 26)]; 1035 step[26] = -input[26] + input[(32 - 27)]; 1036 step[27] = -input[27] + input[(32 - 28)]; 1037 step[28] = -input[28] + input[(32 - 29)]; 1038 step[29] = -input[29] + input[(32 - 30)]; 1039 step[30] = -input[30] + input[(32 - 31)]; 1040 step[31] = -input[31] + input[(32 - 32)]; 1041 1042 // Stage 2 1043 output[0] = step[0] + step[16 - 1]; 1044 output[1] = step[1] + step[16 - 2]; 1045 output[2] = step[2] + step[16 - 3]; 1046 output[3] = step[3] + step[16 - 4]; 1047 output[4] = step[4] + step[16 - 5]; 1048 output[5] = step[5] + step[16 - 6]; 1049 output[6] = step[6] + step[16 - 7]; 1050 output[7] = step[7] + step[16 - 8]; 1051 output[8] = -step[8] + step[16 - 9]; 1052 output[9] = -step[9] + step[16 - 10]; 1053 output[10] = -step[10] + step[16 - 11]; 1054 output[11] = -step[11] + step[16 - 12]; 1055 output[12] = -step[12] + step[16 - 13]; 1056 output[13] = -step[13] + step[16 - 14]; 1057 output[14] = -step[14] + step[16 - 15]; 1058 output[15] = -step[15] + step[16 - 16]; 1059 1060 output[16] = step[16]; 1061 output[17] = step[17]; 1062 output[18] = step[18]; 1063 output[19] = step[19]; 1064 1065 output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); 1066 output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); 1067 output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); 1068 output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); 1069 1070 output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); 1071 output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); 1072 output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); 1073 output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); 1074 1075 output[28] = step[28]; 1076 output[29] = step[29]; 1077 output[30] = step[30]; 1078 output[31] = step[31]; 1079 1080 // dump the magnitude by 4, hence the intermediate values are within 1081 // the range of 16 bits. 1082 if (round) { 1083 output[0] = half_round_shift(output[0]); 1084 output[1] = half_round_shift(output[1]); 1085 output[2] = half_round_shift(output[2]); 1086 output[3] = half_round_shift(output[3]); 1087 output[4] = half_round_shift(output[4]); 1088 output[5] = half_round_shift(output[5]); 1089 output[6] = half_round_shift(output[6]); 1090 output[7] = half_round_shift(output[7]); 1091 output[8] = half_round_shift(output[8]); 1092 output[9] = half_round_shift(output[9]); 1093 output[10] = half_round_shift(output[10]); 1094 output[11] = half_round_shift(output[11]); 1095 output[12] = half_round_shift(output[12]); 1096 output[13] = half_round_shift(output[13]); 1097 output[14] = half_round_shift(output[14]); 1098 output[15] = half_round_shift(output[15]); 1099 1100 output[16] = half_round_shift(output[16]); 1101 output[17] = half_round_shift(output[17]); 1102 output[18] = half_round_shift(output[18]); 1103 output[19] = half_round_shift(output[19]); 1104 output[20] = half_round_shift(output[20]); 1105 output[21] = half_round_shift(output[21]); 1106 output[22] = half_round_shift(output[22]); 1107 output[23] = half_round_shift(output[23]); 1108 output[24] = half_round_shift(output[24]); 1109 output[25] = half_round_shift(output[25]); 1110 output[26] = half_round_shift(output[26]); 1111 output[27] = half_round_shift(output[27]); 1112 output[28] = half_round_shift(output[28]); 1113 output[29] = half_round_shift(output[29]); 1114 output[30] = half_round_shift(output[30]); 1115 output[31] = half_round_shift(output[31]); 1116 } 1117 1118 // Stage 3 1119 step[0] = output[0] + output[(8 - 1)]; 1120 step[1] = output[1] + output[(8 - 2)]; 1121 step[2] = output[2] + output[(8 - 3)]; 1122 step[3] = output[3] + output[(8 - 4)]; 1123 step[4] = -output[4] + output[(8 - 5)]; 1124 step[5] = -output[5] + output[(8 - 6)]; 1125 step[6] = -output[6] + output[(8 - 7)]; 1126 step[7] = -output[7] + output[(8 - 8)]; 1127 step[8] = output[8]; 1128 step[9] = output[9]; 1129 step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); 1130 step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); 1131 step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); 1132 step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); 1133 step[14] = output[14]; 1134 step[15] = output[15]; 1135 1136 step[16] = output[16] + output[23]; 1137 step[17] = output[17] + output[22]; 1138 step[18] = output[18] + output[21]; 1139 step[19] = output[19] + output[20]; 1140 step[20] = -output[20] + output[19]; 1141 step[21] = -output[21] + output[18]; 1142 step[22] = -output[22] + output[17]; 1143 step[23] = -output[23] + output[16]; 1144 step[24] = -output[24] + output[31]; 1145 step[25] = -output[25] + output[30]; 1146 step[26] = -output[26] + output[29]; 1147 step[27] = -output[27] + output[28]; 1148 step[28] = output[28] + output[27]; 1149 step[29] = output[29] + output[26]; 1150 step[30] = output[30] + output[25]; 1151 step[31] = output[31] + output[24]; 1152 1153 // Stage 4 1154 output[0] = step[0] + step[3]; 1155 output[1] = step[1] + step[2]; 1156 output[2] = -step[2] + step[1]; 1157 output[3] = -step[3] + step[0]; 1158 output[4] = step[4]; 1159 output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); 1160 output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); 1161 output[7] = step[7]; 1162 output[8] = step[8] + step[11]; 1163 output[9] = step[9] + step[10]; 1164 output[10] = -step[10] + step[9]; 1165 output[11] = -step[11] + step[8]; 1166 output[12] = -step[12] + step[15]; 1167 output[13] = -step[13] + step[14]; 1168 output[14] = step[14] + step[13]; 1169 output[15] = step[15] + step[12]; 1170 1171 output[16] = step[16]; 1172 output[17] = step[17]; 1173 output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); 1174 output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); 1175 output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); 1176 output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); 1177 output[22] = step[22]; 1178 output[23] = step[23]; 1179 output[24] = step[24]; 1180 output[25] = step[25]; 1181 output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); 1182 output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); 1183 output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); 1184 output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); 1185 output[30] = step[30]; 1186 output[31] = step[31]; 1187 1188 // Stage 5 1189 step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); 1190 step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); 1191 step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); 1192 step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); 1193 step[4] = output[4] + output[5]; 1194 step[5] = -output[5] + output[4]; 1195 step[6] = -output[6] + output[7]; 1196 step[7] = output[7] + output[6]; 1197 step[8] = output[8]; 1198 step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); 1199 step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); 1200 step[11] = output[11]; 1201 step[12] = output[12]; 1202 step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); 1203 step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); 1204 step[15] = output[15]; 1205 1206 step[16] = output[16] + output[19]; 1207 step[17] = output[17] + output[18]; 1208 step[18] = -output[18] + output[17]; 1209 step[19] = -output[19] + output[16]; 1210 step[20] = -output[20] + output[23]; 1211 step[21] = -output[21] + output[22]; 1212 step[22] = output[22] + output[21]; 1213 step[23] = output[23] + output[20]; 1214 step[24] = output[24] + output[27]; 1215 step[25] = output[25] + output[26]; 1216 step[26] = -output[26] + output[25]; 1217 step[27] = -output[27] + output[24]; 1218 step[28] = -output[28] + output[31]; 1219 step[29] = -output[29] + output[30]; 1220 step[30] = output[30] + output[29]; 1221 step[31] = output[31] + output[28]; 1222 1223 // Stage 6 1224 output[0] = step[0]; 1225 output[1] = step[1]; 1226 output[2] = step[2]; 1227 output[3] = step[3]; 1228 output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); 1229 output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); 1230 output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); 1231 output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); 1232 output[8] = step[8] + step[9]; 1233 output[9] = -step[9] + step[8]; 1234 output[10] = -step[10] + step[11]; 1235 output[11] = step[11] + step[10]; 1236 output[12] = step[12] + step[13]; 1237 output[13] = -step[13] + step[12]; 1238 output[14] = -step[14] + step[15]; 1239 output[15] = step[15] + step[14]; 1240 1241 output[16] = step[16]; 1242 output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); 1243 output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); 1244 output[19] = step[19]; 1245 output[20] = step[20]; 1246 output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); 1247 output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); 1248 output[23] = step[23]; 1249 output[24] = step[24]; 1250 output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); 1251 output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); 1252 output[27] = step[27]; 1253 output[28] = step[28]; 1254 output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); 1255 output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); 1256 output[31] = step[31]; 1257 1258 // Stage 7 1259 step[0] = output[0]; 1260 step[1] = output[1]; 1261 step[2] = output[2]; 1262 step[3] = output[3]; 1263 step[4] = output[4]; 1264 step[5] = output[5]; 1265 step[6] = output[6]; 1266 step[7] = output[7]; 1267 step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); 1268 step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); 1269 step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); 1270 step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); 1271 step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); 1272 step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); 1273 step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); 1274 step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); 1275 1276 step[16] = output[16] + output[17]; 1277 step[17] = -output[17] + output[16]; 1278 step[18] = -output[18] + output[19]; 1279 step[19] = output[19] + output[18]; 1280 step[20] = output[20] + output[21]; 1281 step[21] = -output[21] + output[20]; 1282 step[22] = -output[22] + output[23]; 1283 step[23] = output[23] + output[22]; 1284 step[24] = output[24] + output[25]; 1285 step[25] = -output[25] + output[24]; 1286 step[26] = -output[26] + output[27]; 1287 step[27] = output[27] + output[26]; 1288 step[28] = output[28] + output[29]; 1289 step[29] = -output[29] + output[28]; 1290 step[30] = -output[30] + output[31]; 1291 step[31] = output[31] + output[30]; 1292 1293 // Final stage --- outputs indices are bit-reversed. 1294 output[0] = step[0]; 1295 output[16] = step[1]; 1296 output[8] = step[2]; 1297 output[24] = step[3]; 1298 output[4] = step[4]; 1299 output[20] = step[5]; 1300 output[12] = step[6]; 1301 output[28] = step[7]; 1302 output[2] = step[8]; 1303 output[18] = step[9]; 1304 output[10] = step[10]; 1305 output[26] = step[11]; 1306 output[6] = step[12]; 1307 output[22] = step[13]; 1308 output[14] = step[14]; 1309 output[30] = step[15]; 1310 1311 output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); 1312 output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); 1313 output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); 1314 output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); 1315 output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); 1316 output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); 1317 output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); 1318 output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); 1319 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); 1320 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); 1321 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); 1322 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); 1323 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); 1324 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); 1325 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); 1326 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); 1327} 1328 1329void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { 1330 int shortpitch = pitch >> 1; 1331 int i, j; 1332 int output[32 * 32]; 1333 1334 // Columns 1335 for (i = 0; i < 32; ++i) { 1336 int temp_in[32], temp_out[32]; 1337 for (j = 0; j < 32; ++j) 1338 temp_in[j] = input[j * shortpitch + i] << 2; 1339 dct32_1d(temp_in, temp_out, 0); 1340 for (j = 0; j < 32; ++j) 1341 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1342 } 1343 1344 // Rows 1345 for (i = 0; i < 32; ++i) { 1346 int temp_in[32], temp_out[32]; 1347 for (j = 0; j < 32; ++j) 1348 temp_in[j] = output[j + i * 32]; 1349 dct32_1d(temp_in, temp_out, 0); 1350 for (j = 0; j < 32; ++j) 1351 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1352 } 1353} 1354 1355// Note that although we use dct_32_round in dct32_1d computation flow, 1356// this 2d fdct32x32 for rate-distortion optimization loop is operating 1357// within 16 bits precision. 1358void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { 1359 int shortpitch = pitch >> 1; 1360 int i, j; 1361 int output[32 * 32]; 1362 1363 // Columns 1364 for (i = 0; i < 32; ++i) { 1365 int temp_in[32], temp_out[32]; 1366 for (j = 0; j < 32; ++j) 1367 temp_in[j] = input[j * shortpitch + i] << 2; 1368 dct32_1d(temp_in, temp_out, 0); 1369 for (j = 0; j < 32; ++j) 1370 // TODO(cd): see quality impact of only doing 1371 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; 1372 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c 1373 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1374 } 1375 1376 // Rows 1377 for (i = 0; i < 32; ++i) { 1378 int temp_in[32], temp_out[32]; 1379 for (j = 0; j < 32; ++j) 1380 temp_in[j] = output[j + i * 32]; 1381 dct32_1d(temp_in, temp_out, 1); 1382 for (j = 0; j < 32; ++j) 1383 out[j + i * 32] = temp_out[j]; 1384 } 1385} 1386