/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <string.h>

#include "vpx_dsp/inv_txfm.h"

// NOTE(review): tran_low_t, tran_high_t, WRAPLOW(), dct_const_round_shift(),
// check_range(), clip_pixel_add(), ROUND_POWER_OF_TWO(), UNIT_QUANT_SHIFT and
// the cospi_*_64 / sinpi_*_9 constants are not defined in this file.
// Presumably they come from vpx_dsp/inv_txfm.h (or headers it pulls in);
// confirm their exact rounding/wrapping semantics there.

// Inverse 4x4 Walsh-Hadamard transform of the 16 coefficients in |input|,
// accumulated into the 4x4 block at |dest| whose rows are |stride| elements
// apart.
//   Pass 1: each group of four consecutive coefficients is shifted right by
//           UNIT_QUANT_SHIFT, transformed, and stored in a local 4x4 buffer.
//   Pass 2: each column of that buffer is transformed and added to the
//           matching |dest| column through clip_pixel_add().
// NOTE(review): '>>' is applied to signed values; for negative operands the
// result is implementation-defined in C -- presumably an arithmetic shift is
// expected.
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: four coefficients at a time, written to |output|.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  // Pass 2: walk the intermediate buffer column by column (element stride 4)
  // and add each result into the destination column.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}

// Variant of the inverse 4x4 Walsh-Hadamard transform that reads only in[0].
// With x = in[0] >> UNIT_QUANT_SHIFT, the intermediate row is
// { x - (x >> 1), x >> 1, x >> 1, x >> 1 }. Each of those four values v is
// then split the same way down its column: the first row of |dest| receives
// v - (v >> 1) and the remaining three rows receive v >> 1, all through
// clip_pixel_add(). Rows of |dest| are |dest_stride| elements apart.
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, 8);
  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}

// 4-point 1-D inverse DCT: reads input[0..3] and writes output[0..3].
// Every read of |input| happens in stage 1, before the first write to
// |output| in stage 2, so |input| and |output| may point at the same array
// (idct8_c calls it that way).
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
}

// 4x4 inverse DCT over all 16 coefficients: idct4_c is run on each group of
// four consecutive coefficients of |input|, then on each column of that
// result. Every column output is passed through ROUND_POWER_OF_TWO(x, 4) and
// added into |dest| (rows |stride| elements apart) via clip_pixel_add().
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    // Gather column i into a contiguous buffer for the 1-D transform.
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}

// 4x4 inverse DCT shortcut that reads only input[0]. The value is multiplied
// by cospi_16_64 and passed through dct_const_round_shift()/WRAPLOW() twice,
// reduced with ROUND_POWER_OF_TWO(x, 4), and the single result is added to
// all 16 destination elements via clip_pixel_add().
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}

// 8-point 1-D inverse DCT: reads input[0..7] and writes output[0..7].
// The even-indexed inputs are copied to step1[0..3] and handed to idct4_c;
// the odd-indexed inputs go through the rotations and butterflies below.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2 & stage 3 - even half
  // In-place call: idct4_c finishes reading its input before writing output.
  idct4_c(step1, step1);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}

// 8x8 inverse DCT over all 64 coefficients: idct8_c on each group of eight
// consecutive coefficients, then on each column of that result. Column
// outputs go through ROUND_POWER_OF_TWO(x, 5) and are added into |dest|
// (rows |stride| elements apart) via clip_pixel_add().
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    // Gather column i into a contiguous buffer for the 1-D transform.
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

// 8x8 inverse DCT shortcut that reads only input[0]. Same two-step
// cospi_16_64 scaling as vpx_idct4x4_1_add_c, reduced with
// ROUND_POWER_OF_TWO(x, 5); the single result is added to all 64 destination
// elements via clip_pixel_add().
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

// 4-point 1-D inverse ADST: reads input[0..3] and writes output[0..3].
// If all four inputs are zero the outputs are set to zero and the function
// returns before doing any multiplication.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // Bitwise OR is zero only when every input is zero.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}

// 8-point 1-D inverse ADST: reads input[0..7] (in the permuted order shown
// below) and writes output[0..7]. Returns all zeros early when every input
// is zero.
// NOTE(review): unlike iadst4_c and iadst16_c, the s-terms here are plain
// 'int' and every product expression is explicitly cast to int. Presumably
// intentional; confirm before widening them.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // Bitwise OR is zero only when every input is zero.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  // x0..x3 are plain sums/differences here, so no round shift is applied.
  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  // Final permutation; four of the outputs are negated.
  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
}

// Reduced 8x8 inverse DCT: only the first four groups of eight coefficients
// in |input| are row-transformed; the remaining rows of the intermediate
// buffer stay at their zero initial value. The column pass and the
// accumulation into |dest| are the same as in vpx_idct8x8_64_add_c.
// NOTE(review): the "12" in the name presumably refers to how many non-zero
// coefficients the caller guarantees; this function itself never reads
// input[32..63] -- confirm the contract with the caller.
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  // only the first 4 rows have non-zero coefficients
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

// 16-point 1-D inverse DCT: reads input[0..15] and writes output[0..15].
// Seven stages alternate between the step1[] and step2[] scratch arrays; the
// result is written to |output| only in stage 7, after all reads of |input|
// (stage 1) are done.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  // Input permutation. The indices are written as k/2 (integer constant
  // expressions, e.g. 16/2 == 8); the numerators are the indices idct32_c
  // uses for its own stage 1.
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // output[k] and output[15 - k] are the sum and difference of
  // step2[k] and step2[15 - k].
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}

// 16x16 inverse DCT over all 256 coefficients: idct16_c on each group of 16
// consecutive coefficients, then on each column of that result. Column
// outputs go through ROUND_POWER_OF_TWO(x, 6) and are added into |dest|
// (rows |stride| elements apart) via clip_pixel_add().
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    // Gather column i into a contiguous buffer for the 1-D transform.
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

// 16-point 1-D inverse ADST: reads input[0..15] (in the permuted order shown
// below) and writes output[0..15]. Returns all zeros early when every input
// is zero. Four stages; the s-terms are kept in tran_high_t.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Bitwise OR is zero only when every input is zero.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // x0..x7 are plain sums/differences here, so no round shift is applied.
  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  // NOTE(review): the unscaled sums/differences go through check_range()
  // here, whereas stage 2 wraps them directly; check_range() is defined
  // elsewhere -- confirm what it does.
  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  // Final permutation; four of the outputs are negated.
  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
}

// Reduced 16x16 inverse DCT: only the first four groups of 16 coefficients
// in |input| are row-transformed; the remaining rows of the intermediate
// buffer stay at their zero initial value. The column pass and the
// accumulation into |dest| are the same as in vpx_idct16x16_256_add_c.
// NOTE(review): the "10" in the name presumably refers to how many non-zero
// coefficients the caller guarantees; this function itself never reads
// input[64..255] -- confirm the contract with the caller.
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j*16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

// 16x16 inverse DCT shortcut that reads only input[0]. Same two-step
// cospi_16_64 scaling as the 4x4 and 8x8 "_1_" variants, reduced with
// ROUND_POWER_OF_TWO(x, 6); the single result is added to all 256
// destination elements via clip_pixel_add().
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] *
cospi_9_64 + input[23] * cospi_23_64; 816 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); 817 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); 818 819 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 820 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 821 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); 822 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); 823 824 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 825 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 826 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); 827 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); 828 829 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 830 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 831 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); 832 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); 833 834 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 835 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 836 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); 837 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); 838 839 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 840 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 841 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); 842 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); 843 844 // stage 2 845 step2[0] = step1[0]; 846 step2[1] = step1[1]; 847 step2[2] = step1[2]; 848 step2[3] = step1[3]; 849 step2[4] = step1[4]; 850 step2[5] = step1[5]; 851 step2[6] = step1[6]; 852 step2[7] = step1[7]; 853 854 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 855 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 856 step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); 857 step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); 858 859 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 860 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 861 step2[9] = 
WRAPLOW(dct_const_round_shift(temp1), 8); 862 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); 863 864 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 865 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 866 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); 867 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); 868 869 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 870 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 871 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); 872 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); 873 874 step2[16] = WRAPLOW(step1[16] + step1[17], 8); 875 step2[17] = WRAPLOW(step1[16] - step1[17], 8); 876 step2[18] = WRAPLOW(-step1[18] + step1[19], 8); 877 step2[19] = WRAPLOW(step1[18] + step1[19], 8); 878 step2[20] = WRAPLOW(step1[20] + step1[21], 8); 879 step2[21] = WRAPLOW(step1[20] - step1[21], 8); 880 step2[22] = WRAPLOW(-step1[22] + step1[23], 8); 881 step2[23] = WRAPLOW(step1[22] + step1[23], 8); 882 step2[24] = WRAPLOW(step1[24] + step1[25], 8); 883 step2[25] = WRAPLOW(step1[24] - step1[25], 8); 884 step2[26] = WRAPLOW(-step1[26] + step1[27], 8); 885 step2[27] = WRAPLOW(step1[26] + step1[27], 8); 886 step2[28] = WRAPLOW(step1[28] + step1[29], 8); 887 step2[29] = WRAPLOW(step1[28] - step1[29], 8); 888 step2[30] = WRAPLOW(-step1[30] + step1[31], 8); 889 step2[31] = WRAPLOW(step1[30] + step1[31], 8); 890 891 // stage 3 892 step1[0] = step2[0]; 893 step1[1] = step2[1]; 894 step1[2] = step2[2]; 895 step1[3] = step2[3]; 896 897 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 898 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 899 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); 900 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); 901 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 902 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 903 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); 904 step1[6] = 
WRAPLOW(dct_const_round_shift(temp2), 8); 905 906 step1[8] = WRAPLOW(step2[8] + step2[9], 8); 907 step1[9] = WRAPLOW(step2[8] - step2[9], 8); 908 step1[10] = WRAPLOW(-step2[10] + step2[11], 8); 909 step1[11] = WRAPLOW(step2[10] + step2[11], 8); 910 step1[12] = WRAPLOW(step2[12] + step2[13], 8); 911 step1[13] = WRAPLOW(step2[12] - step2[13], 8); 912 step1[14] = WRAPLOW(-step2[14] + step2[15], 8); 913 step1[15] = WRAPLOW(step2[14] + step2[15], 8); 914 915 step1[16] = step2[16]; 916 step1[31] = step2[31]; 917 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 918 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 919 step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); 920 step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); 921 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 922 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 923 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); 924 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); 925 step1[19] = step2[19]; 926 step1[20] = step2[20]; 927 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 928 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 929 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); 930 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); 931 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 932 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 933 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); 934 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); 935 step1[23] = step2[23]; 936 step1[24] = step2[24]; 937 step1[27] = step2[27]; 938 step1[28] = step2[28]; 939 940 // stage 4 941 temp1 = (step1[0] + step1[1]) * cospi_16_64; 942 temp2 = (step1[0] - step1[1]) * cospi_16_64; 943 step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); 944 step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); 945 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 946 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 947 
step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); 948 step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); 949 step2[4] = WRAPLOW(step1[4] + step1[5], 8); 950 step2[5] = WRAPLOW(step1[4] - step1[5], 8); 951 step2[6] = WRAPLOW(-step1[6] + step1[7], 8); 952 step2[7] = WRAPLOW(step1[6] + step1[7], 8); 953 954 step2[8] = step1[8]; 955 step2[15] = step1[15]; 956 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 957 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 958 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); 959 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); 960 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 961 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 962 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); 963 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); 964 step2[11] = step1[11]; 965 step2[12] = step1[12]; 966 967 step2[16] = WRAPLOW(step1[16] + step1[19], 8); 968 step2[17] = WRAPLOW(step1[17] + step1[18], 8); 969 step2[18] = WRAPLOW(step1[17] - step1[18], 8); 970 step2[19] = WRAPLOW(step1[16] - step1[19], 8); 971 step2[20] = WRAPLOW(-step1[20] + step1[23], 8); 972 step2[21] = WRAPLOW(-step1[21] + step1[22], 8); 973 step2[22] = WRAPLOW(step1[21] + step1[22], 8); 974 step2[23] = WRAPLOW(step1[20] + step1[23], 8); 975 976 step2[24] = WRAPLOW(step1[24] + step1[27], 8); 977 step2[25] = WRAPLOW(step1[25] + step1[26], 8); 978 step2[26] = WRAPLOW(step1[25] - step1[26], 8); 979 step2[27] = WRAPLOW(step1[24] - step1[27], 8); 980 step2[28] = WRAPLOW(-step1[28] + step1[31], 8); 981 step2[29] = WRAPLOW(-step1[29] + step1[30], 8); 982 step2[30] = WRAPLOW(step1[29] + step1[30], 8); 983 step2[31] = WRAPLOW(step1[28] + step1[31], 8); 984 985 // stage 5 986 step1[0] = WRAPLOW(step2[0] + step2[3], 8); 987 step1[1] = WRAPLOW(step2[1] + step2[2], 8); 988 step1[2] = WRAPLOW(step2[1] - step2[2], 8); 989 step1[3] = WRAPLOW(step2[0] - step2[3], 8); 990 step1[4] = step2[4]; 991 temp1 = (step2[6] - step2[5]) * 
cospi_16_64; 992 temp2 = (step2[5] + step2[6]) * cospi_16_64; 993 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); 994 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); 995 step1[7] = step2[7]; 996 997 step1[8] = WRAPLOW(step2[8] + step2[11], 8); 998 step1[9] = WRAPLOW(step2[9] + step2[10], 8); 999 step1[10] = WRAPLOW(step2[9] - step2[10], 8); 1000 step1[11] = WRAPLOW(step2[8] - step2[11], 8); 1001 step1[12] = WRAPLOW(-step2[12] + step2[15], 8); 1002 step1[13] = WRAPLOW(-step2[13] + step2[14], 8); 1003 step1[14] = WRAPLOW(step2[13] + step2[14], 8); 1004 step1[15] = WRAPLOW(step2[12] + step2[15], 8); 1005 1006 step1[16] = step2[16]; 1007 step1[17] = step2[17]; 1008 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 1009 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 1010 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); 1011 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); 1012 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 1013 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 1014 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); 1015 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); 1016 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 1017 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 1018 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); 1019 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); 1020 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 1021 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 1022 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); 1023 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); 1024 step1[22] = step2[22]; 1025 step1[23] = step2[23]; 1026 step1[24] = step2[24]; 1027 step1[25] = step2[25]; 1028 step1[30] = step2[30]; 1029 step1[31] = step2[31]; 1030 1031 // stage 6 1032 step2[0] = WRAPLOW(step1[0] + step1[7], 8); 1033 step2[1] = WRAPLOW(step1[1] + step1[6], 8); 1034 step2[2] = WRAPLOW(step1[2] + step1[5], 8); 1035 
step2[3] = WRAPLOW(step1[3] + step1[4], 8); 1036 step2[4] = WRAPLOW(step1[3] - step1[4], 8); 1037 step2[5] = WRAPLOW(step1[2] - step1[5], 8); 1038 step2[6] = WRAPLOW(step1[1] - step1[6], 8); 1039 step2[7] = WRAPLOW(step1[0] - step1[7], 8); 1040 step2[8] = step1[8]; 1041 step2[9] = step1[9]; 1042 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1043 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1044 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); 1045 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); 1046 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1047 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1048 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); 1049 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); 1050 step2[14] = step1[14]; 1051 step2[15] = step1[15]; 1052 1053 step2[16] = WRAPLOW(step1[16] + step1[23], 8); 1054 step2[17] = WRAPLOW(step1[17] + step1[22], 8); 1055 step2[18] = WRAPLOW(step1[18] + step1[21], 8); 1056 step2[19] = WRAPLOW(step1[19] + step1[20], 8); 1057 step2[20] = WRAPLOW(step1[19] - step1[20], 8); 1058 step2[21] = WRAPLOW(step1[18] - step1[21], 8); 1059 step2[22] = WRAPLOW(step1[17] - step1[22], 8); 1060 step2[23] = WRAPLOW(step1[16] - step1[23], 8); 1061 1062 step2[24] = WRAPLOW(-step1[24] + step1[31], 8); 1063 step2[25] = WRAPLOW(-step1[25] + step1[30], 8); 1064 step2[26] = WRAPLOW(-step1[26] + step1[29], 8); 1065 step2[27] = WRAPLOW(-step1[27] + step1[28], 8); 1066 step2[28] = WRAPLOW(step1[27] + step1[28], 8); 1067 step2[29] = WRAPLOW(step1[26] + step1[29], 8); 1068 step2[30] = WRAPLOW(step1[25] + step1[30], 8); 1069 step2[31] = WRAPLOW(step1[24] + step1[31], 8); 1070 1071 // stage 7 1072 step1[0] = WRAPLOW(step2[0] + step2[15], 8); 1073 step1[1] = WRAPLOW(step2[1] + step2[14], 8); 1074 step1[2] = WRAPLOW(step2[2] + step2[13], 8); 1075 step1[3] = WRAPLOW(step2[3] + step2[12], 8); 1076 step1[4] = WRAPLOW(step2[4] + step2[11], 8); 1077 step1[5] = WRAPLOW(step2[5] + step2[10], 8); 1078 step1[6] = WRAPLOW(step2[6] + 
step2[9], 8); 1079 step1[7] = WRAPLOW(step2[7] + step2[8], 8); 1080 step1[8] = WRAPLOW(step2[7] - step2[8], 8); 1081 step1[9] = WRAPLOW(step2[6] - step2[9], 8); 1082 step1[10] = WRAPLOW(step2[5] - step2[10], 8); 1083 step1[11] = WRAPLOW(step2[4] - step2[11], 8); 1084 step1[12] = WRAPLOW(step2[3] - step2[12], 8); 1085 step1[13] = WRAPLOW(step2[2] - step2[13], 8); 1086 step1[14] = WRAPLOW(step2[1] - step2[14], 8); 1087 step1[15] = WRAPLOW(step2[0] - step2[15], 8); 1088 1089 step1[16] = step2[16]; 1090 step1[17] = step2[17]; 1091 step1[18] = step2[18]; 1092 step1[19] = step2[19]; 1093 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 1094 temp2 = (step2[20] + step2[27]) * cospi_16_64; 1095 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); 1096 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); 1097 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 1098 temp2 = (step2[21] + step2[26]) * cospi_16_64; 1099 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); 1100 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); 1101 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 1102 temp2 = (step2[22] + step2[25]) * cospi_16_64; 1103 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); 1104 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); 1105 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 1106 temp2 = (step2[23] + step2[24]) * cospi_16_64; 1107 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); 1108 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); 1109 step1[28] = step2[28]; 1110 step1[29] = step2[29]; 1111 step1[30] = step2[30]; 1112 step1[31] = step2[31]; 1113 1114 // final stage 1115 output[0] = WRAPLOW(step1[0] + step1[31], 8); 1116 output[1] = WRAPLOW(step1[1] + step1[30], 8); 1117 output[2] = WRAPLOW(step1[2] + step1[29], 8); 1118 output[3] = WRAPLOW(step1[3] + step1[28], 8); 1119 output[4] = WRAPLOW(step1[4] + step1[27], 8); 1120 output[5] = WRAPLOW(step1[5] + step1[26], 8); 1121 output[6] = WRAPLOW(step1[6] + step1[25], 8); 1122 output[7] = 
WRAPLOW(step1[7] + step1[24], 8); 1123 output[8] = WRAPLOW(step1[8] + step1[23], 8); 1124 output[9] = WRAPLOW(step1[9] + step1[22], 8); 1125 output[10] = WRAPLOW(step1[10] + step1[21], 8); 1126 output[11] = WRAPLOW(step1[11] + step1[20], 8); 1127 output[12] = WRAPLOW(step1[12] + step1[19], 8); 1128 output[13] = WRAPLOW(step1[13] + step1[18], 8); 1129 output[14] = WRAPLOW(step1[14] + step1[17], 8); 1130 output[15] = WRAPLOW(step1[15] + step1[16], 8); 1131 output[16] = WRAPLOW(step1[15] - step1[16], 8); 1132 output[17] = WRAPLOW(step1[14] - step1[17], 8); 1133 output[18] = WRAPLOW(step1[13] - step1[18], 8); 1134 output[19] = WRAPLOW(step1[12] - step1[19], 8); 1135 output[20] = WRAPLOW(step1[11] - step1[20], 8); 1136 output[21] = WRAPLOW(step1[10] - step1[21], 8); 1137 output[22] = WRAPLOW(step1[9] - step1[22], 8); 1138 output[23] = WRAPLOW(step1[8] - step1[23], 8); 1139 output[24] = WRAPLOW(step1[7] - step1[24], 8); 1140 output[25] = WRAPLOW(step1[6] - step1[25], 8); 1141 output[26] = WRAPLOW(step1[5] - step1[26], 8); 1142 output[27] = WRAPLOW(step1[4] - step1[27], 8); 1143 output[28] = WRAPLOW(step1[3] - step1[28], 8); 1144 output[29] = WRAPLOW(step1[2] - step1[29], 8); 1145 output[30] = WRAPLOW(step1[1] - step1[30], 8); 1146 output[31] = WRAPLOW(step1[0] - step1[31], 8); 1147} 1148 1149void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, 1150 int stride) { 1151 tran_low_t out[32 * 32]; 1152 tran_low_t *outptr = out; 1153 int i, j; 1154 tran_low_t temp_in[32], temp_out[32]; 1155 1156 // Rows 1157 for (i = 0; i < 32; ++i) { 1158 int16_t zero_coeff[16]; 1159 for (j = 0; j < 16; ++j) 1160 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 1161 for (j = 0; j < 8; ++j) 1162 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1163 for (j = 0; j < 4; ++j) 1164 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1165 for (j = 0; j < 2; ++j) 1166 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1167 1168 if (zero_coeff[0] | 
zero_coeff[1]) 1169 idct32_c(input, outptr); 1170 else 1171 memset(outptr, 0, sizeof(tran_low_t) * 32); 1172 input += 32; 1173 outptr += 32; 1174 } 1175 1176 // Columns 1177 for (i = 0; i < 32; ++i) { 1178 for (j = 0; j < 32; ++j) 1179 temp_in[j] = out[j * 32 + i]; 1180 idct32_c(temp_in, temp_out); 1181 for (j = 0; j < 32; ++j) { 1182 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1183 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1184 } 1185 } 1186} 1187 1188void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, 1189 int stride) { 1190 tran_low_t out[32 * 32] = {0}; 1191 tran_low_t *outptr = out; 1192 int i, j; 1193 tran_low_t temp_in[32], temp_out[32]; 1194 1195 // Rows 1196 // only upper-left 8x8 has non-zero coeff 1197 for (i = 0; i < 8; ++i) { 1198 idct32_c(input, outptr); 1199 input += 32; 1200 outptr += 32; 1201 } 1202 1203 // Columns 1204 for (i = 0; i < 32; ++i) { 1205 for (j = 0; j < 32; ++j) 1206 temp_in[j] = out[j * 32 + i]; 1207 idct32_c(temp_in, temp_out); 1208 for (j = 0; j < 32; ++j) { 1209 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1210 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1211 } 1212 } 1213} 1214 1215void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 1216 int i, j; 1217 tran_high_t a1; 1218 1219 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); 1220 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); 1221 a1 = ROUND_POWER_OF_TWO(out, 6); 1222 1223 for (j = 0; j < 32; ++j) { 1224 for (i = 0; i < 32; ++i) 1225 dest[i] = clip_pixel_add(dest[i], a1); 1226 dest += stride; 1227 } 1228} 1229 1230#if CONFIG_VP9_HIGHBITDEPTH 1231void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1232 int stride, int bd) { 1233 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 1234 0.5 shifts per pixel. 
*/ 1235 int i; 1236 tran_low_t output[16]; 1237 tran_high_t a1, b1, c1, d1, e1; 1238 const tran_low_t *ip = input; 1239 tran_low_t *op = output; 1240 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1241 1242 for (i = 0; i < 4; i++) { 1243 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1244 c1 = ip[1] >> UNIT_QUANT_SHIFT; 1245 d1 = ip[2] >> UNIT_QUANT_SHIFT; 1246 b1 = ip[3] >> UNIT_QUANT_SHIFT; 1247 a1 += c1; 1248 d1 -= b1; 1249 e1 = (a1 - d1) >> 1; 1250 b1 = e1 - b1; 1251 c1 = e1 - c1; 1252 a1 -= b1; 1253 d1 += c1; 1254 op[0] = WRAPLOW(a1, bd); 1255 op[1] = WRAPLOW(b1, bd); 1256 op[2] = WRAPLOW(c1, bd); 1257 op[3] = WRAPLOW(d1, bd); 1258 ip += 4; 1259 op += 4; 1260 } 1261 1262 ip = output; 1263 for (i = 0; i < 4; i++) { 1264 a1 = ip[4 * 0]; 1265 c1 = ip[4 * 1]; 1266 d1 = ip[4 * 2]; 1267 b1 = ip[4 * 3]; 1268 a1 += c1; 1269 d1 -= b1; 1270 e1 = (a1 - d1) >> 1; 1271 b1 = e1 - b1; 1272 c1 = e1 - c1; 1273 a1 -= b1; 1274 d1 += c1; 1275 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); 1276 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); 1277 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); 1278 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); 1279 1280 ip++; 1281 dest++; 1282 } 1283} 1284 1285void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, 1286 int dest_stride, int bd) { 1287 int i; 1288 tran_high_t a1, e1; 1289 tran_low_t tmp[4]; 1290 const tran_low_t *ip = in; 1291 tran_low_t *op = tmp; 1292 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1293 (void) bd; 1294 1295 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1296 e1 = a1 >> 1; 1297 a1 -= e1; 1298 op[0] = WRAPLOW(a1, bd); 1299 op[1] = op[2] = op[3] = WRAPLOW(e1, bd); 1300 1301 ip = tmp; 1302 for (i = 0; i < 4; i++) { 1303 e1 = ip[0] >> 1; 1304 a1 = ip[0] - e1; 1305 dest[dest_stride * 0] = highbd_clip_pixel_add( 1306 dest[dest_stride * 0], a1, bd); 1307 dest[dest_stride * 1] = highbd_clip_pixel_add( 1308 dest[dest_stride * 1], e1, bd); 1309 dest[dest_stride * 
2] = highbd_clip_pixel_add( 1310 dest[dest_stride * 2], e1, bd); 1311 dest[dest_stride * 3] = highbd_clip_pixel_add( 1312 dest[dest_stride * 3], e1, bd); 1313 ip++; 1314 dest++; 1315 } 1316} 1317 1318void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { 1319 tran_low_t step[4]; 1320 tran_high_t temp1, temp2; 1321 (void) bd; 1322 // stage 1 1323 temp1 = (input[0] + input[2]) * cospi_16_64; 1324 temp2 = (input[0] - input[2]) * cospi_16_64; 1325 step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1326 step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1327 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 1328 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 1329 step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1330 step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1331 1332 // stage 2 1333 output[0] = WRAPLOW(step[0] + step[3], bd); 1334 output[1] = WRAPLOW(step[1] + step[2], bd); 1335 output[2] = WRAPLOW(step[1] - step[2], bd); 1336 output[3] = WRAPLOW(step[0] - step[3], bd); 1337} 1338 1339void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1340 int stride, int bd) { 1341 tran_low_t out[4 * 4]; 1342 tran_low_t *outptr = out; 1343 int i, j; 1344 tran_low_t temp_in[4], temp_out[4]; 1345 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1346 1347 // Rows 1348 for (i = 0; i < 4; ++i) { 1349 vpx_highbd_idct4_c(input, outptr, bd); 1350 input += 4; 1351 outptr += 4; 1352 } 1353 1354 // Columns 1355 for (i = 0; i < 4; ++i) { 1356 for (j = 0; j < 4; ++j) 1357 temp_in[j] = out[j * 4 + i]; 1358 vpx_highbd_idct4_c(temp_in, temp_out, bd); 1359 for (j = 0; j < 4; ++j) { 1360 dest[j * stride + i] = highbd_clip_pixel_add( 1361 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1362 } 1363 } 1364} 1365 1366void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, 1367 int dest_stride, int bd) { 1368 int i; 1369 tran_high_t a1; 1370 tran_low_t out = 
WRAPLOW( 1371 highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); 1372 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1373 1374 out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); 1375 a1 = ROUND_POWER_OF_TWO(out, 4); 1376 1377 for (i = 0; i < 4; i++) { 1378 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); 1379 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); 1380 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); 1381 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); 1382 dest += dest_stride; 1383 } 1384} 1385 1386void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { 1387 tran_low_t step1[8], step2[8]; 1388 tran_high_t temp1, temp2; 1389 // stage 1 1390 step1[0] = input[0]; 1391 step1[2] = input[4]; 1392 step1[1] = input[2]; 1393 step1[3] = input[6]; 1394 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 1395 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 1396 step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1397 step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1398 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 1399 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 1400 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1401 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1402 1403 // stage 2 & stage 3 - even half 1404 vpx_highbd_idct4_c(step1, step1, bd); 1405 1406 // stage 2 - odd half 1407 step2[4] = WRAPLOW(step1[4] + step1[5], bd); 1408 step2[5] = WRAPLOW(step1[4] - step1[5], bd); 1409 step2[6] = WRAPLOW(-step1[6] + step1[7], bd); 1410 step2[7] = WRAPLOW(step1[6] + step1[7], bd); 1411 1412 // stage 3 - odd half 1413 step1[4] = step2[4]; 1414 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1415 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1416 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1417 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1418 step1[7] = step2[7]; 1419 1420 // 
stage 4 1421 output[0] = WRAPLOW(step1[0] + step1[7], bd); 1422 output[1] = WRAPLOW(step1[1] + step1[6], bd); 1423 output[2] = WRAPLOW(step1[2] + step1[5], bd); 1424 output[3] = WRAPLOW(step1[3] + step1[4], bd); 1425 output[4] = WRAPLOW(step1[3] - step1[4], bd); 1426 output[5] = WRAPLOW(step1[2] - step1[5], bd); 1427 output[6] = WRAPLOW(step1[1] - step1[6], bd); 1428 output[7] = WRAPLOW(step1[0] - step1[7], bd); 1429} 1430 1431void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, 1432 int stride, int bd) { 1433 tran_low_t out[8 * 8]; 1434 tran_low_t *outptr = out; 1435 int i, j; 1436 tran_low_t temp_in[8], temp_out[8]; 1437 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1438 1439 // First transform rows. 1440 for (i = 0; i < 8; ++i) { 1441 vpx_highbd_idct8_c(input, outptr, bd); 1442 input += 8; 1443 outptr += 8; 1444 } 1445 1446 // Then transform columns. 1447 for (i = 0; i < 8; ++i) { 1448 for (j = 0; j < 8; ++j) 1449 temp_in[j] = out[j * 8 + i]; 1450 vpx_highbd_idct8_c(temp_in, temp_out, bd); 1451 for (j = 0; j < 8; ++j) { 1452 dest[j * stride + i] = highbd_clip_pixel_add( 1453 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1454 } 1455 } 1456} 1457 1458void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, 1459 int stride, int bd) { 1460 int i, j; 1461 tran_high_t a1; 1462 tran_low_t out = WRAPLOW( 1463 highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); 1464 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1465 out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); 1466 a1 = ROUND_POWER_OF_TWO(out, 5); 1467 for (j = 0; j < 8; ++j) { 1468 for (i = 0; i < 8; ++i) 1469 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); 1470 dest += stride; 1471 } 1472} 1473 1474void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { 1475 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1476 1477 tran_low_t x0 = input[0]; 1478 tran_low_t x1 = input[1]; 1479 tran_low_t x2 = input[2]; 1480 
tran_low_t x3 = input[3]; 1481 (void) bd; 1482 1483 if (!(x0 | x1 | x2 | x3)) { 1484 memset(output, 0, 4 * sizeof(*output)); 1485 return; 1486 } 1487 1488 s0 = sinpi_1_9 * x0; 1489 s1 = sinpi_2_9 * x0; 1490 s2 = sinpi_3_9 * x1; 1491 s3 = sinpi_4_9 * x2; 1492 s4 = sinpi_1_9 * x2; 1493 s5 = sinpi_2_9 * x3; 1494 s6 = sinpi_4_9 * x3; 1495 s7 = (tran_high_t)(x0 - x2 + x3); 1496 1497 s0 = s0 + s3 + s5; 1498 s1 = s1 - s4 - s6; 1499 s3 = s2; 1500 s2 = sinpi_3_9 * s7; 1501 1502 // 1-D transform scaling factor is sqrt(2). 1503 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 1504 // + 1b (addition) = 29b. 1505 // Hence the output bit depth is 15b. 1506 output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd); 1507 output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd); 1508 output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); 1509 output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd); 1510} 1511 1512void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { 1513 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1514 1515 tran_low_t x0 = input[7]; 1516 tran_low_t x1 = input[0]; 1517 tran_low_t x2 = input[5]; 1518 tran_low_t x3 = input[2]; 1519 tran_low_t x4 = input[3]; 1520 tran_low_t x5 = input[4]; 1521 tran_low_t x6 = input[1]; 1522 tran_low_t x7 = input[6]; 1523 (void) bd; 1524 1525 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 1526 memset(output, 0, 8 * sizeof(*output)); 1527 return; 1528 } 1529 1530 // stage 1 1531 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 1532 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 1533 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 1534 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 1535 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 1536 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 1537 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 1538 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 1539 1540 x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd); 1541 x1 = 
WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd); 1542 x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd); 1543 x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd); 1544 x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd); 1545 x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd); 1546 x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd); 1547 x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd); 1548 1549 // stage 2 1550 s0 = x0; 1551 s1 = x1; 1552 s2 = x2; 1553 s3 = x3; 1554 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 1555 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 1556 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 1557 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 1558 1559 x0 = WRAPLOW(s0 + s2, bd); 1560 x1 = WRAPLOW(s1 + s3, bd); 1561 x2 = WRAPLOW(s0 - s2, bd); 1562 x3 = WRAPLOW(s1 - s3, bd); 1563 x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); 1564 x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); 1565 x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); 1566 x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); 1567 1568 // stage 3 1569 s2 = cospi_16_64 * (x2 + x3); 1570 s3 = cospi_16_64 * (x2 - x3); 1571 s6 = cospi_16_64 * (x6 + x7); 1572 s7 = cospi_16_64 * (x6 - x7); 1573 1574 x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); 1575 x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); 1576 x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); 1577 x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); 1578 1579 output[0] = WRAPLOW(x0, bd); 1580 output[1] = WRAPLOW(-x4, bd); 1581 output[2] = WRAPLOW(x6, bd); 1582 output[3] = WRAPLOW(-x2, bd); 1583 output[4] = WRAPLOW(x3, bd); 1584 output[5] = WRAPLOW(-x7, bd); 1585 output[6] = WRAPLOW(x5, bd); 1586 output[7] = WRAPLOW(-x1, bd); 1587} 1588 1589void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, 1590 int stride, int bd) { 1591 tran_low_t out[8 * 8] = { 0 }; 1592 tran_low_t *outptr = out; 1593 int i, j; 
1594 tran_low_t temp_in[8], temp_out[8]; 1595 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1596 1597 // First transform rows. 1598 // Only first 4 row has non-zero coefs. 1599 for (i = 0; i < 4; ++i) { 1600 vpx_highbd_idct8_c(input, outptr, bd); 1601 input += 8; 1602 outptr += 8; 1603 } 1604 // Then transform columns. 1605 for (i = 0; i < 8; ++i) { 1606 for (j = 0; j < 8; ++j) 1607 temp_in[j] = out[j * 8 + i]; 1608 vpx_highbd_idct8_c(temp_in, temp_out, bd); 1609 for (j = 0; j < 8; ++j) { 1610 dest[j * stride + i] = highbd_clip_pixel_add( 1611 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1612 } 1613 } 1614} 1615 1616void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { 1617 tran_low_t step1[16], step2[16]; 1618 tran_high_t temp1, temp2; 1619 (void) bd; 1620 1621 // stage 1 1622 step1[0] = input[0/2]; 1623 step1[1] = input[16/2]; 1624 step1[2] = input[8/2]; 1625 step1[3] = input[24/2]; 1626 step1[4] = input[4/2]; 1627 step1[5] = input[20/2]; 1628 step1[6] = input[12/2]; 1629 step1[7] = input[28/2]; 1630 step1[8] = input[2/2]; 1631 step1[9] = input[18/2]; 1632 step1[10] = input[10/2]; 1633 step1[11] = input[26/2]; 1634 step1[12] = input[6/2]; 1635 step1[13] = input[22/2]; 1636 step1[14] = input[14/2]; 1637 step1[15] = input[30/2]; 1638 1639 // stage 2 1640 step2[0] = step1[0]; 1641 step2[1] = step1[1]; 1642 step2[2] = step1[2]; 1643 step2[3] = step1[3]; 1644 step2[4] = step1[4]; 1645 step2[5] = step1[5]; 1646 step2[6] = step1[6]; 1647 step2[7] = step1[7]; 1648 1649 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 1650 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 1651 step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1652 step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1653 1654 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 1655 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 1656 step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1657 
step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1658 1659 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1660 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1661 step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1662 step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1663 1664 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1665 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1666 step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1667 step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1668 1669 // stage 3 1670 step1[0] = step2[0]; 1671 step1[1] = step2[1]; 1672 step1[2] = step2[2]; 1673 step1[3] = step2[3]; 1674 1675 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1676 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1677 step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1678 step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1679 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1680 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1681 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1682 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1683 1684 step1[8] = WRAPLOW(step2[8] + step2[9], bd); 1685 step1[9] = WRAPLOW(step2[8] - step2[9], bd); 1686 step1[10] = WRAPLOW(-step2[10] + step2[11], bd); 1687 step1[11] = WRAPLOW(step2[10] + step2[11], bd); 1688 step1[12] = WRAPLOW(step2[12] + step2[13], bd); 1689 step1[13] = WRAPLOW(step2[12] - step2[13], bd); 1690 step1[14] = WRAPLOW(-step2[14] + step2[15], bd); 1691 step1[15] = WRAPLOW(step2[14] + step2[15], bd); 1692 1693 // stage 4 1694 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1695 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1696 step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1697 step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1698 temp1 = step1[2] * cospi_24_64 - 
step1[3] * cospi_8_64; 1699 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 1700 step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1701 step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1702 step2[4] = WRAPLOW(step1[4] + step1[5], bd); 1703 step2[5] = WRAPLOW(step1[4] - step1[5], bd); 1704 step2[6] = WRAPLOW(-step1[6] + step1[7], bd); 1705 step2[7] = WRAPLOW(step1[6] + step1[7], bd); 1706 1707 step2[8] = step1[8]; 1708 step2[15] = step1[15]; 1709 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 1710 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 1711 step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1712 step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1713 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 1714 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 1715 step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1716 step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1717 step2[11] = step1[11]; 1718 step2[12] = step1[12]; 1719 1720 // stage 5 1721 step1[0] = WRAPLOW(step2[0] + step2[3], bd); 1722 step1[1] = WRAPLOW(step2[1] + step2[2], bd); 1723 step1[2] = WRAPLOW(step2[1] - step2[2], bd); 1724 step1[3] = WRAPLOW(step2[0] - step2[3], bd); 1725 step1[4] = step2[4]; 1726 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1727 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1728 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1729 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1730 step1[7] = step2[7]; 1731 1732 step1[8] = WRAPLOW(step2[8] + step2[11], bd); 1733 step1[9] = WRAPLOW(step2[9] + step2[10], bd); 1734 step1[10] = WRAPLOW(step2[9] - step2[10], bd); 1735 step1[11] = WRAPLOW(step2[8] - step2[11], bd); 1736 step1[12] = WRAPLOW(-step2[12] + step2[15], bd); 1737 step1[13] = WRAPLOW(-step2[13] + step2[14], bd); 1738 step1[14] = WRAPLOW(step2[13] + step2[14], bd); 1739 step1[15] = WRAPLOW(step2[12] + 
step2[15], bd); 1740 1741 // stage 6 1742 step2[0] = WRAPLOW(step1[0] + step1[7], bd); 1743 step2[1] = WRAPLOW(step1[1] + step1[6], bd); 1744 step2[2] = WRAPLOW(step1[2] + step1[5], bd); 1745 step2[3] = WRAPLOW(step1[3] + step1[4], bd); 1746 step2[4] = WRAPLOW(step1[3] - step1[4], bd); 1747 step2[5] = WRAPLOW(step1[2] - step1[5], bd); 1748 step2[6] = WRAPLOW(step1[1] - step1[6], bd); 1749 step2[7] = WRAPLOW(step1[0] - step1[7], bd); 1750 step2[8] = step1[8]; 1751 step2[9] = step1[9]; 1752 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1753 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1754 step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1755 step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1756 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1757 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1758 step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 1759 step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 1760 step2[14] = step1[14]; 1761 step2[15] = step1[15]; 1762 1763 // stage 7 1764 output[0] = WRAPLOW(step2[0] + step2[15], bd); 1765 output[1] = WRAPLOW(step2[1] + step2[14], bd); 1766 output[2] = WRAPLOW(step2[2] + step2[13], bd); 1767 output[3] = WRAPLOW(step2[3] + step2[12], bd); 1768 output[4] = WRAPLOW(step2[4] + step2[11], bd); 1769 output[5] = WRAPLOW(step2[5] + step2[10], bd); 1770 output[6] = WRAPLOW(step2[6] + step2[9], bd); 1771 output[7] = WRAPLOW(step2[7] + step2[8], bd); 1772 output[8] = WRAPLOW(step2[7] - step2[8], bd); 1773 output[9] = WRAPLOW(step2[6] - step2[9], bd); 1774 output[10] = WRAPLOW(step2[5] - step2[10], bd); 1775 output[11] = WRAPLOW(step2[4] - step2[11], bd); 1776 output[12] = WRAPLOW(step2[3] - step2[12], bd); 1777 output[13] = WRAPLOW(step2[2] - step2[13], bd); 1778 output[14] = WRAPLOW(step2[1] - step2[14], bd); 1779 output[15] = WRAPLOW(step2[0] - step2[15], bd); 1780} 1781 1782void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, 1783 int 
stride, int bd) { 1784 tran_low_t out[16 * 16]; 1785 tran_low_t *outptr = out; 1786 int i, j; 1787 tran_low_t temp_in[16], temp_out[16]; 1788 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1789 1790 // First transform rows. 1791 for (i = 0; i < 16; ++i) { 1792 vpx_highbd_idct16_c(input, outptr, bd); 1793 input += 16; 1794 outptr += 16; 1795 } 1796 1797 // Then transform columns. 1798 for (i = 0; i < 16; ++i) { 1799 for (j = 0; j < 16; ++j) 1800 temp_in[j] = out[j * 16 + i]; 1801 vpx_highbd_idct16_c(temp_in, temp_out, bd); 1802 for (j = 0; j < 16; ++j) { 1803 dest[j * stride + i] = highbd_clip_pixel_add( 1804 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 1805 } 1806 } 1807} 1808 1809void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { 1810 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 1811 tran_high_t s9, s10, s11, s12, s13, s14, s15; 1812 1813 tran_low_t x0 = input[15]; 1814 tran_low_t x1 = input[0]; 1815 tran_low_t x2 = input[13]; 1816 tran_low_t x3 = input[2]; 1817 tran_low_t x4 = input[11]; 1818 tran_low_t x5 = input[4]; 1819 tran_low_t x6 = input[9]; 1820 tran_low_t x7 = input[6]; 1821 tran_low_t x8 = input[7]; 1822 tran_low_t x9 = input[8]; 1823 tran_low_t x10 = input[5]; 1824 tran_low_t x11 = input[10]; 1825 tran_low_t x12 = input[3]; 1826 tran_low_t x13 = input[12]; 1827 tran_low_t x14 = input[1]; 1828 tran_low_t x15 = input[14]; 1829 (void) bd; 1830 1831 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 1832 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { 1833 memset(output, 0, 16 * sizeof(*output)); 1834 return; 1835 } 1836 1837 // stage 1 1838 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 1839 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 1840 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 1841 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 1842 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 1843 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 1844 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 1845 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 1846 s8 = x8 
* cospi_17_64 + x9 * cospi_15_64; 1847 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 1848 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 1849 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 1850 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 1851 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 1852 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 1853 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 1854 1855 x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd); 1856 x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd); 1857 x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd); 1858 x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd); 1859 x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd); 1860 x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd); 1861 x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd); 1862 x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd); 1863 x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd); 1864 x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd); 1865 x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd); 1866 x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd); 1867 x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd); 1868 x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd); 1869 x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd); 1870 x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd); 1871 1872 // stage 2 1873 s0 = x0; 1874 s1 = x1; 1875 s2 = x2; 1876 s3 = x3; 1877 s4 = x4; 1878 s5 = x5; 1879 s6 = x6; 1880 s7 = x7; 1881 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 1882 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 1883 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 1884 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 1885 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; 1886 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 1887 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; 1888 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 1889 1890 x0 = 
WRAPLOW(s0 + s4, bd); 1891 x1 = WRAPLOW(s1 + s5, bd); 1892 x2 = WRAPLOW(s2 + s6, bd); 1893 x3 = WRAPLOW(s3 + s7, bd); 1894 x4 = WRAPLOW(s0 - s4, bd); 1895 x5 = WRAPLOW(s1 - s5, bd); 1896 x6 = WRAPLOW(s2 - s6, bd); 1897 x7 = WRAPLOW(s3 - s7, bd); 1898 x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd); 1899 x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd); 1900 x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd); 1901 x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd); 1902 x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd); 1903 x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd); 1904 x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd); 1905 x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd); 1906 1907 // stage 3 1908 s0 = x0; 1909 s1 = x1; 1910 s2 = x2; 1911 s3 = x3; 1912 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 1913 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 1914 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; 1915 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 1916 s8 = x8; 1917 s9 = x9; 1918 s10 = x10; 1919 s11 = x11; 1920 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 1921 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 1922 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; 1923 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 1924 1925 x0 = WRAPLOW(s0 + s2, bd); 1926 x1 = WRAPLOW(s1 + s3, bd); 1927 x2 = WRAPLOW(s0 - s2, bd); 1928 x3 = WRAPLOW(s1 - s3, bd); 1929 x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); 1930 x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); 1931 x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); 1932 x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); 1933 x8 = WRAPLOW(s8 + s10, bd); 1934 x9 = WRAPLOW(s9 + s11, bd); 1935 x10 = WRAPLOW(s8 - s10, bd); 1936 x11 = WRAPLOW(s9 - s11, bd); 1937 x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd); 1938 x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd); 1939 x14 = 
WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd); 1940 x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd); 1941 1942 // stage 4 1943 s2 = (- cospi_16_64) * (x2 + x3); 1944 s3 = cospi_16_64 * (x2 - x3); 1945 s6 = cospi_16_64 * (x6 + x7); 1946 s7 = cospi_16_64 * (-x6 + x7); 1947 s10 = cospi_16_64 * (x10 + x11); 1948 s11 = cospi_16_64 * (-x10 + x11); 1949 s14 = (- cospi_16_64) * (x14 + x15); 1950 s15 = cospi_16_64 * (x14 - x15); 1951 1952 x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); 1953 x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); 1954 x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); 1955 x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); 1956 x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd); 1957 x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd); 1958 x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd); 1959 x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd); 1960 1961 output[0] = WRAPLOW(x0, bd); 1962 output[1] = WRAPLOW(-x8, bd); 1963 output[2] = WRAPLOW(x12, bd); 1964 output[3] = WRAPLOW(-x4, bd); 1965 output[4] = WRAPLOW(x6, bd); 1966 output[5] = WRAPLOW(x14, bd); 1967 output[6] = WRAPLOW(x10, bd); 1968 output[7] = WRAPLOW(x2, bd); 1969 output[8] = WRAPLOW(x3, bd); 1970 output[9] = WRAPLOW(x11, bd); 1971 output[10] = WRAPLOW(x15, bd); 1972 output[11] = WRAPLOW(x7, bd); 1973 output[12] = WRAPLOW(x5, bd); 1974 output[13] = WRAPLOW(-x13, bd); 1975 output[14] = WRAPLOW(x9, bd); 1976 output[15] = WRAPLOW(-x1, bd); 1977} 1978 1979void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, 1980 int stride, int bd) { 1981 tran_low_t out[16 * 16] = { 0 }; 1982 tran_low_t *outptr = out; 1983 int i, j; 1984 tran_low_t temp_in[16], temp_out[16]; 1985 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1986 1987 // First transform rows. Since all non-zero dct coefficients are in 1988 // upper-left 4x4 area, we only need to calculate first 4 rows here. 
1989 for (i = 0; i < 4; ++i) { 1990 vpx_highbd_idct16_c(input, outptr, bd); 1991 input += 16; 1992 outptr += 16; 1993 } 1994 1995 // Then transform columns. 1996 for (i = 0; i < 16; ++i) { 1997 for (j = 0; j < 16; ++j) 1998 temp_in[j] = out[j*16 + i]; 1999 vpx_highbd_idct16_c(temp_in, temp_out, bd); 2000 for (j = 0; j < 16; ++j) { 2001 dest[j * stride + i] = highbd_clip_pixel_add( 2002 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2003 } 2004 } 2005} 2006 2007void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, 2008 int stride, int bd) { 2009 int i, j; 2010 tran_high_t a1; 2011 tran_low_t out = WRAPLOW( 2012 highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); 2013 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2014 2015 out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); 2016 a1 = ROUND_POWER_OF_TWO(out, 6); 2017 for (j = 0; j < 16; ++j) { 2018 for (i = 0; i < 16; ++i) 2019 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); 2020 dest += stride; 2021 } 2022} 2023 2024static void highbd_idct32_c(const tran_low_t *input, 2025 tran_low_t *output, int bd) { 2026 tran_low_t step1[32], step2[32]; 2027 tran_high_t temp1, temp2; 2028 (void) bd; 2029 2030 // stage 1 2031 step1[0] = input[0]; 2032 step1[1] = input[16]; 2033 step1[2] = input[8]; 2034 step1[3] = input[24]; 2035 step1[4] = input[4]; 2036 step1[5] = input[20]; 2037 step1[6] = input[12]; 2038 step1[7] = input[28]; 2039 step1[8] = input[2]; 2040 step1[9] = input[18]; 2041 step1[10] = input[10]; 2042 step1[11] = input[26]; 2043 step1[12] = input[6]; 2044 step1[13] = input[22]; 2045 step1[14] = input[14]; 2046 step1[15] = input[30]; 2047 2048 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 2049 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 2050 step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2051 step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2052 2053 temp1 = input[17] * cospi_15_64 - input[15] 
* cospi_17_64; 2054 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 2055 step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2056 step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2057 2058 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 2059 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 2060 step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2061 step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2062 2063 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 2064 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 2065 step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2066 step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2067 2068 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 2069 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 2070 step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2071 step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2072 2073 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 2074 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 2075 step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2076 step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2077 2078 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 2079 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 2080 step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2081 step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2082 2083 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 2084 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 2085 step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2086 step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2087 2088 // stage 2 2089 step2[0] = step1[0]; 2090 step2[1] = step1[1]; 2091 step2[2] = step1[2]; 2092 step2[3] = step1[3]; 2093 step2[4] = 
step1[4]; 2094 step2[5] = step1[5]; 2095 step2[6] = step1[6]; 2096 step2[7] = step1[7]; 2097 2098 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 2099 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 2100 step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2101 step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2102 2103 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 2104 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 2105 step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2106 step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2107 2108 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 2109 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 2110 step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2111 step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2112 2113 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 2114 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 2115 step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2116 step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2117 2118 step2[16] = WRAPLOW(step1[16] + step1[17], bd); 2119 step2[17] = WRAPLOW(step1[16] - step1[17], bd); 2120 step2[18] = WRAPLOW(-step1[18] + step1[19], bd); 2121 step2[19] = WRAPLOW(step1[18] + step1[19], bd); 2122 step2[20] = WRAPLOW(step1[20] + step1[21], bd); 2123 step2[21] = WRAPLOW(step1[20] - step1[21], bd); 2124 step2[22] = WRAPLOW(-step1[22] + step1[23], bd); 2125 step2[23] = WRAPLOW(step1[22] + step1[23], bd); 2126 step2[24] = WRAPLOW(step1[24] + step1[25], bd); 2127 step2[25] = WRAPLOW(step1[24] - step1[25], bd); 2128 step2[26] = WRAPLOW(-step1[26] + step1[27], bd); 2129 step2[27] = WRAPLOW(step1[26] + step1[27], bd); 2130 step2[28] = WRAPLOW(step1[28] + step1[29], bd); 2131 step2[29] = WRAPLOW(step1[28] - step1[29], bd); 2132 step2[30] = WRAPLOW(-step1[30] + step1[31], bd); 2133 step2[31] = 
WRAPLOW(step1[30] + step1[31], bd); 2134 2135 // stage 3 2136 step1[0] = step2[0]; 2137 step1[1] = step2[1]; 2138 step1[2] = step2[2]; 2139 step1[3] = step2[3]; 2140 2141 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 2142 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 2143 step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2144 step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2145 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 2146 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 2147 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2148 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2149 2150 step1[8] = WRAPLOW(step2[8] + step2[9], bd); 2151 step1[9] = WRAPLOW(step2[8] - step2[9], bd); 2152 step1[10] = WRAPLOW(-step2[10] + step2[11], bd); 2153 step1[11] = WRAPLOW(step2[10] + step2[11], bd); 2154 step1[12] = WRAPLOW(step2[12] + step2[13], bd); 2155 step1[13] = WRAPLOW(step2[12] - step2[13], bd); 2156 step1[14] = WRAPLOW(-step2[14] + step2[15], bd); 2157 step1[15] = WRAPLOW(step2[14] + step2[15], bd); 2158 2159 step1[16] = step2[16]; 2160 step1[31] = step2[31]; 2161 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 2162 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 2163 step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2164 step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2165 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 2166 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 2167 step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2168 step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2169 step1[19] = step2[19]; 2170 step1[20] = step2[20]; 2171 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 2172 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 2173 step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2174 step1[26] = 
WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2175 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 2176 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 2177 step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2178 step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2179 step1[23] = step2[23]; 2180 step1[24] = step2[24]; 2181 step1[27] = step2[27]; 2182 step1[28] = step2[28]; 2183 2184 // stage 4 2185 temp1 = (step1[0] + step1[1]) * cospi_16_64; 2186 temp2 = (step1[0] - step1[1]) * cospi_16_64; 2187 step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2188 step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2189 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 2190 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 2191 step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2192 step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2193 step2[4] = WRAPLOW(step1[4] + step1[5], bd); 2194 step2[5] = WRAPLOW(step1[4] - step1[5], bd); 2195 step2[6] = WRAPLOW(-step1[6] + step1[7], bd); 2196 step2[7] = WRAPLOW(step1[6] + step1[7], bd); 2197 2198 step2[8] = step1[8]; 2199 step2[15] = step1[15]; 2200 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 2201 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 2202 step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2203 step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2204 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 2205 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 2206 step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2207 step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2208 step2[11] = step1[11]; 2209 step2[12] = step1[12]; 2210 2211 step2[16] = WRAPLOW(step1[16] + step1[19], bd); 2212 step2[17] = WRAPLOW(step1[17] + step1[18], bd); 2213 step2[18] = WRAPLOW(step1[17] - step1[18], bd); 2214 step2[19] = 
WRAPLOW(step1[16] - step1[19], bd); 2215 step2[20] = WRAPLOW(-step1[20] + step1[23], bd); 2216 step2[21] = WRAPLOW(-step1[21] + step1[22], bd); 2217 step2[22] = WRAPLOW(step1[21] + step1[22], bd); 2218 step2[23] = WRAPLOW(step1[20] + step1[23], bd); 2219 2220 step2[24] = WRAPLOW(step1[24] + step1[27], bd); 2221 step2[25] = WRAPLOW(step1[25] + step1[26], bd); 2222 step2[26] = WRAPLOW(step1[25] - step1[26], bd); 2223 step2[27] = WRAPLOW(step1[24] - step1[27], bd); 2224 step2[28] = WRAPLOW(-step1[28] + step1[31], bd); 2225 step2[29] = WRAPLOW(-step1[29] + step1[30], bd); 2226 step2[30] = WRAPLOW(step1[29] + step1[30], bd); 2227 step2[31] = WRAPLOW(step1[28] + step1[31], bd); 2228 2229 // stage 5 2230 step1[0] = WRAPLOW(step2[0] + step2[3], bd); 2231 step1[1] = WRAPLOW(step2[1] + step2[2], bd); 2232 step1[2] = WRAPLOW(step2[1] - step2[2], bd); 2233 step1[3] = WRAPLOW(step2[0] - step2[3], bd); 2234 step1[4] = step2[4]; 2235 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2236 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2237 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2238 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2239 step1[7] = step2[7]; 2240 2241 step1[8] = WRAPLOW(step2[8] + step2[11], bd); 2242 step1[9] = WRAPLOW(step2[9] + step2[10], bd); 2243 step1[10] = WRAPLOW(step2[9] - step2[10], bd); 2244 step1[11] = WRAPLOW(step2[8] - step2[11], bd); 2245 step1[12] = WRAPLOW(-step2[12] + step2[15], bd); 2246 step1[13] = WRAPLOW(-step2[13] + step2[14], bd); 2247 step1[14] = WRAPLOW(step2[13] + step2[14], bd); 2248 step1[15] = WRAPLOW(step2[12] + step2[15], bd); 2249 2250 step1[16] = step2[16]; 2251 step1[17] = step2[17]; 2252 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 2253 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 2254 step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2255 step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2256 temp1 = -step2[19] * cospi_8_64 + step2[28] * 
cospi_24_64; 2257 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 2258 step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2259 step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2260 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 2261 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 2262 step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2263 step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2264 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 2265 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 2266 step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2267 step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2268 step1[22] = step2[22]; 2269 step1[23] = step2[23]; 2270 step1[24] = step2[24]; 2271 step1[25] = step2[25]; 2272 step1[30] = step2[30]; 2273 step1[31] = step2[31]; 2274 2275 // stage 6 2276 step2[0] = WRAPLOW(step1[0] + step1[7], bd); 2277 step2[1] = WRAPLOW(step1[1] + step1[6], bd); 2278 step2[2] = WRAPLOW(step1[2] + step1[5], bd); 2279 step2[3] = WRAPLOW(step1[3] + step1[4], bd); 2280 step2[4] = WRAPLOW(step1[3] - step1[4], bd); 2281 step2[5] = WRAPLOW(step1[2] - step1[5], bd); 2282 step2[6] = WRAPLOW(step1[1] - step1[6], bd); 2283 step2[7] = WRAPLOW(step1[0] - step1[7], bd); 2284 step2[8] = step1[8]; 2285 step2[9] = step1[9]; 2286 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2287 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2288 step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2289 step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2290 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2291 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2292 step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2293 step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2294 step2[14] = step1[14]; 2295 step2[15] = step1[15]; 2296 2297 step2[16] = WRAPLOW(step1[16] + 
step1[23], bd); 2298 step2[17] = WRAPLOW(step1[17] + step1[22], bd); 2299 step2[18] = WRAPLOW(step1[18] + step1[21], bd); 2300 step2[19] = WRAPLOW(step1[19] + step1[20], bd); 2301 step2[20] = WRAPLOW(step1[19] - step1[20], bd); 2302 step2[21] = WRAPLOW(step1[18] - step1[21], bd); 2303 step2[22] = WRAPLOW(step1[17] - step1[22], bd); 2304 step2[23] = WRAPLOW(step1[16] - step1[23], bd); 2305 2306 step2[24] = WRAPLOW(-step1[24] + step1[31], bd); 2307 step2[25] = WRAPLOW(-step1[25] + step1[30], bd); 2308 step2[26] = WRAPLOW(-step1[26] + step1[29], bd); 2309 step2[27] = WRAPLOW(-step1[27] + step1[28], bd); 2310 step2[28] = WRAPLOW(step1[27] + step1[28], bd); 2311 step2[29] = WRAPLOW(step1[26] + step1[29], bd); 2312 step2[30] = WRAPLOW(step1[25] + step1[30], bd); 2313 step2[31] = WRAPLOW(step1[24] + step1[31], bd); 2314 2315 // stage 7 2316 step1[0] = WRAPLOW(step2[0] + step2[15], bd); 2317 step1[1] = WRAPLOW(step2[1] + step2[14], bd); 2318 step1[2] = WRAPLOW(step2[2] + step2[13], bd); 2319 step1[3] = WRAPLOW(step2[3] + step2[12], bd); 2320 step1[4] = WRAPLOW(step2[4] + step2[11], bd); 2321 step1[5] = WRAPLOW(step2[5] + step2[10], bd); 2322 step1[6] = WRAPLOW(step2[6] + step2[9], bd); 2323 step1[7] = WRAPLOW(step2[7] + step2[8], bd); 2324 step1[8] = WRAPLOW(step2[7] - step2[8], bd); 2325 step1[9] = WRAPLOW(step2[6] - step2[9], bd); 2326 step1[10] = WRAPLOW(step2[5] - step2[10], bd); 2327 step1[11] = WRAPLOW(step2[4] - step2[11], bd); 2328 step1[12] = WRAPLOW(step2[3] - step2[12], bd); 2329 step1[13] = WRAPLOW(step2[2] - step2[13], bd); 2330 step1[14] = WRAPLOW(step2[1] - step2[14], bd); 2331 step1[15] = WRAPLOW(step2[0] - step2[15], bd); 2332 2333 step1[16] = step2[16]; 2334 step1[17] = step2[17]; 2335 step1[18] = step2[18]; 2336 step1[19] = step2[19]; 2337 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 2338 temp2 = (step2[20] + step2[27]) * cospi_16_64; 2339 step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2340 step1[27] = 
WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2341 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 2342 temp2 = (step2[21] + step2[26]) * cospi_16_64; 2343 step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2344 step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2345 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 2346 temp2 = (step2[22] + step2[25]) * cospi_16_64; 2347 step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2348 step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2349 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 2350 temp2 = (step2[23] + step2[24]) * cospi_16_64; 2351 step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); 2352 step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); 2353 step1[28] = step2[28]; 2354 step1[29] = step2[29]; 2355 step1[30] = step2[30]; 2356 step1[31] = step2[31]; 2357 2358 // final stage 2359 output[0] = WRAPLOW(step1[0] + step1[31], bd); 2360 output[1] = WRAPLOW(step1[1] + step1[30], bd); 2361 output[2] = WRAPLOW(step1[2] + step1[29], bd); 2362 output[3] = WRAPLOW(step1[3] + step1[28], bd); 2363 output[4] = WRAPLOW(step1[4] + step1[27], bd); 2364 output[5] = WRAPLOW(step1[5] + step1[26], bd); 2365 output[6] = WRAPLOW(step1[6] + step1[25], bd); 2366 output[7] = WRAPLOW(step1[7] + step1[24], bd); 2367 output[8] = WRAPLOW(step1[8] + step1[23], bd); 2368 output[9] = WRAPLOW(step1[9] + step1[22], bd); 2369 output[10] = WRAPLOW(step1[10] + step1[21], bd); 2370 output[11] = WRAPLOW(step1[11] + step1[20], bd); 2371 output[12] = WRAPLOW(step1[12] + step1[19], bd); 2372 output[13] = WRAPLOW(step1[13] + step1[18], bd); 2373 output[14] = WRAPLOW(step1[14] + step1[17], bd); 2374 output[15] = WRAPLOW(step1[15] + step1[16], bd); 2375 output[16] = WRAPLOW(step1[15] - step1[16], bd); 2376 output[17] = WRAPLOW(step1[14] - step1[17], bd); 2377 output[18] = WRAPLOW(step1[13] - step1[18], bd); 2378 output[19] = WRAPLOW(step1[12] - step1[19], bd); 2379 
output[20] = WRAPLOW(step1[11] - step1[20], bd); 2380 output[21] = WRAPLOW(step1[10] - step1[21], bd); 2381 output[22] = WRAPLOW(step1[9] - step1[22], bd); 2382 output[23] = WRAPLOW(step1[8] - step1[23], bd); 2383 output[24] = WRAPLOW(step1[7] - step1[24], bd); 2384 output[25] = WRAPLOW(step1[6] - step1[25], bd); 2385 output[26] = WRAPLOW(step1[5] - step1[26], bd); 2386 output[27] = WRAPLOW(step1[4] - step1[27], bd); 2387 output[28] = WRAPLOW(step1[3] - step1[28], bd); 2388 output[29] = WRAPLOW(step1[2] - step1[29], bd); 2389 output[30] = WRAPLOW(step1[1] - step1[30], bd); 2390 output[31] = WRAPLOW(step1[0] - step1[31], bd); 2391} 2392 2393void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, 2394 int stride, int bd) { 2395 tran_low_t out[32 * 32]; 2396 tran_low_t *outptr = out; 2397 int i, j; 2398 tran_low_t temp_in[32], temp_out[32]; 2399 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2400 2401 // Rows 2402 for (i = 0; i < 32; ++i) { 2403 tran_low_t zero_coeff[16]; 2404 for (j = 0; j < 16; ++j) 2405 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 2406 for (j = 0; j < 8; ++j) 2407 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2408 for (j = 0; j < 4; ++j) 2409 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2410 for (j = 0; j < 2; ++j) 2411 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2412 2413 if (zero_coeff[0] | zero_coeff[1]) 2414 highbd_idct32_c(input, outptr, bd); 2415 else 2416 memset(outptr, 0, sizeof(tran_low_t) * 32); 2417 input += 32; 2418 outptr += 32; 2419 } 2420 2421 // Columns 2422 for (i = 0; i < 32; ++i) { 2423 for (j = 0; j < 32; ++j) 2424 temp_in[j] = out[j * 32 + i]; 2425 highbd_idct32_c(temp_in, temp_out, bd); 2426 for (j = 0; j < 32; ++j) { 2427 dest[j * stride + i] = highbd_clip_pixel_add( 2428 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2429 } 2430 } 2431} 2432 2433void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, 2434 int stride, int bd) 
{ 2435 tran_low_t out[32 * 32] = {0}; 2436 tran_low_t *outptr = out; 2437 int i, j; 2438 tran_low_t temp_in[32], temp_out[32]; 2439 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2440 2441 // Rows 2442 // Only upper-left 8x8 has non-zero coeff. 2443 for (i = 0; i < 8; ++i) { 2444 highbd_idct32_c(input, outptr, bd); 2445 input += 32; 2446 outptr += 32; 2447 } 2448 // Columns 2449 for (i = 0; i < 32; ++i) { 2450 for (j = 0; j < 32; ++j) 2451 temp_in[j] = out[j * 32 + i]; 2452 highbd_idct32_c(temp_in, temp_out, bd); 2453 for (j = 0; j < 32; ++j) { 2454 dest[j * stride + i] = highbd_clip_pixel_add( 2455 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2456 } 2457 } 2458} 2459 2460void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, 2461 int stride, int bd) { 2462 int i, j; 2463 int a1; 2464 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2465 2466 tran_low_t out = WRAPLOW( 2467 highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); 2468 out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); 2469 a1 = ROUND_POWER_OF_TWO(out, 6); 2470 2471 for (j = 0; j < 32; ++j) { 2472 for (i = 0; i < 32; ++i) 2473 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); 2474 dest += stride; 2475 } 2476} 2477#endif // CONFIG_VP9_HIGHBITDEPTH 2478