/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <assert.h>
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <math.h>
13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "./vpx_config.h"
15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "./vp9_rtcd.h"
16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_systemdependent.h"
17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_blockd.h"
18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_common.h"
19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_idct.h"
20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
22ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
23ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang   0.5 shifts per pixel. */
24ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
25ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t output[16];
26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int a1, b1, c1, d1, e1;
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *ip = input;
28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t *op = output;
29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    a1 = ip[0] >> UNIT_QUANT_SHIFT;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    c1 = ip[1] >> UNIT_QUANT_SHIFT;
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    d1 = ip[2] >> UNIT_QUANT_SHIFT;
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    b1 = ip[3] >> UNIT_QUANT_SHIFT;
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 += c1;
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 -= b1;
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    e1 = (a1 - d1) >> 1;
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    b1 = e1 - b1;
39ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    c1 = e1 - c1;
40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 -= b1;
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 += c1;
42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[0] = a1;
43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[1] = b1;
44ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[2] = c1;
45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[3] = d1;
46ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ip += 4;
47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op += 4;
48ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
50ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  ip = output;
51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 = ip[4 * 0];
53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    c1 = ip[4 * 1];
54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 = ip[4 * 2];
55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    b1 = ip[4 * 3];
56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 += c1;
57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 -= b1;
58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    e1 = (a1 - d1) >> 1;
59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    b1 = e1 - b1;
60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    c1 = e1 - c1;
61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 -= b1;
62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 += c1;
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ip++;
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest++;
70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
75ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int a1, e1;
76ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t tmp[4];
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *ip = in;
78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t *op = tmp;
79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a1 = ip[0] >> UNIT_QUANT_SHIFT;
81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  e1 = a1 >> 1;
82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  a1 -= e1;
83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  op[0] = a1;
84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  op[1] = op[2] = op[3] = e1;
85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  ip = tmp;
87ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
88ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    e1 = ip[0] >> 1;
89ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 = ip[0] - e1;
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ip++;
95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest++;
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
99b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct4(const int16_t *input, int16_t *output) {
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t step[4];
101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int temp1, temp2;
102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (input[0] + input[2]) * cospi_16_64;
104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (input[0] - input[2]) * cospi_16_64;
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[0] = dct_const_round_shift(temp1);
106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[1] = dct_const_round_shift(temp2);
107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[2] = dct_const_round_shift(temp1);
110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[3] = dct_const_round_shift(temp2);
111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2
113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = step[0] + step[3];
114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = step[1] + step[2];
115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = step[1] - step[2];
116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = step[0] - step[3];
117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// 2-D 4x4 inverse DCT of the 16 coefficients in |input| (four consecutive
// groups of four). The result is rounded by 4 bits and added to the 4x4 pixel
// block at |dest| (row pitch |stride|) through clip_pixel().
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t rows[4 * 4];
  int16_t col_in[4], col_out[4];
  int row, col;

  // Row pass: transform each group of four coefficients into |rows|.
  for (row = 0; row < 4; ++row)
    idct4(input + 4 * row, rows + 4 * row);

  // Column pass: gather one column, transform it, then accumulate into dest.
  for (col = 0; col < 4; ++col) {
    for (row = 0; row < 4; ++row)
      col_in[row] = rows[row * 4 + col];

    idct4(col_in, col_out);

    for (row = 0; row < 4; ++row) {
      uint8_t *const px = &dest[row * stride + col];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[row], 4) + *px);
    }
  }
}
142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int a1;
146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  out = dct_const_round_shift(out * cospi_16_64);
148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  a1 = ROUND_POWER_OF_TWO(out, 4);
149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[0] = clip_pixel(dest[0] + a1);
152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[1] = clip_pixel(dest[1] + a1);
153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[2] = clip_pixel(dest[2] + a1);
154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest[3] = clip_pixel(dest[3] + a1);
155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    dest += dest_stride;
156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
159b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct8(const int16_t *input, int16_t *output) {
160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t step1[8], step2[8];
161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int temp1, temp2;
162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = input[0];
164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = input[4];
165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = input[2];
166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = input[6];
167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = dct_const_round_shift(temp1);
170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = dct_const_round_shift(temp2);
171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = dct_const_round_shift(temp1);
174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = dct_const_round_shift(temp2);
175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2 & stage 3 - even half
177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  idct4(step1, step1);
178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2 - odd half
180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[4] = step1[4] + step1[5];
181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[5] = step1[4] - step1[5];
182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[6] = -step1[6] + step1[7];
183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[7] = step1[6] + step1[7];
184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 3 -odd half
186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = step2[4];
187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step2[6] - step2[5]) * cospi_16_64;
188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step2[5] + step2[6]) * cospi_16_64;
189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = dct_const_round_shift(temp1);
190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = dct_const_round_shift(temp2);
191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = step2[7];
192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 4
194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = step1[0] + step1[7];
195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = step1[1] + step1[6];
196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = step1[2] + step1[5];
197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = step1[3] + step1[4];
198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] = step1[3] - step1[4];
199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = step1[2] - step1[5];
200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] = step1[1] - step1[6];
201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = step1[0] - step1[7];
202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// 2-D 8x8 inverse DCT of the 64 coefficients in |input| (eight consecutive
// groups of eight). The result is rounded by 5 bits and added to the 8x8 pixel
// block at |dest| (row pitch |stride|) through clip_pixel().
void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t rows[8 * 8];
  int16_t col_in[8], col_out[8];
  int row, col;

  // First transform rows
  for (row = 0; row < 8; ++row)
    idct8(input + 8 * row, rows + 8 * row);

  // Then transform columns: gather, transform, accumulate into dest.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row)
      col_in[row] = rows[row * 8 + col];

    idct8(col_in, col_out);

    for (row = 0; row < 8; ++row) {
      uint8_t *const px = &dest[row * stride + col];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[row], 5) + *px);
    }
  }
}
227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
229f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int i, j;
230f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a1;
231f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
232f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  out = dct_const_round_shift(out * cospi_16_64);
233f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a1 = ROUND_POWER_OF_TWO(out, 5);
234f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  for (j = 0; j < 8; ++j) {
235f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    for (i = 0; i < 8; ++i)
236f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      dest[i] = clip_pixel(dest[i] + a1);
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest += stride;
238f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  }
239f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang}
240f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
241b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst4(const int16_t *input, int16_t *output) {
242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int s0, s1, s2, s3, s4, s5, s6, s7;
243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x0 = input[0];
245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x1 = input[1];
246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x2 = input[2];
247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x3 = input[3];
248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  if (!(x0 | x1 | x2 | x3)) {
250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    output[0] = output[1] = output[2] = output[3] = 0;
251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    return;
252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = sinpi_1_9 * x0;
255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = sinpi_2_9 * x0;
256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = sinpi_3_9 * x1;
257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = sinpi_4_9 * x2;
258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = sinpi_1_9 * x2;
259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = sinpi_2_9 * x3;
260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = sinpi_4_9 * x3;
261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = x0 - x2 + x3;
262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s3 + s5;
264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 - s4 - s6;
265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = sinpi_3_9 * s7;
266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s2;
267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0 + x3;
269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1 + x3;
270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2;
271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x0 + x1 - x3;
272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1-D transform scaling factor is sqrt(2).
274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // + 1b (addition) = 29b.
276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Hence the output bit depth is 15b.
277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = dct_const_round_shift(s0);
278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = dct_const_round_shift(s1);
279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = dct_const_round_shift(s2);
280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = dct_const_round_shift(s3);
281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         int tx_type) {
285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const transform_2d IHT_4[] = {
286b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    { idct4, idct4  },  // DCT_DCT  = 0
287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    { iadst4, idct4  },   // ADST_DCT = 1
288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    { idct4, iadst4 },    // DCT_ADST = 2
289b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    { iadst4, iadst4 }      // ADST_ADST = 3
290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  };
291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i, j;
293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t out[4 * 4];
294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t *outptr = out;
295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t temp_in[4], temp_out[4];
296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // inverse transform row vectors
298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; ++i) {
299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    IHT_4[tx_type].rows(input, outptr);
300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    input  += 4;
301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    outptr += 4;
302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // inverse transform column vectors
305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; ++i) {
306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (j = 0; j < 4; ++j)
307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      temp_in[j] = out[j * 4 + i];
308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    IHT_4[tx_type].cols(temp_in, temp_out);
309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (j = 0; j < 4; ++j)
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  + dest[j * stride + i]);
312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst8(const int16_t *input, int16_t *output) {
315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int s0, s1, s2, s3, s4, s5, s6, s7;
316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x0 = input[7];
318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x1 = input[0];
319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x2 = input[5];
320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x3 = input[2];
321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x4 = input[3];
322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x5 = input[4];
323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x6 = input[1];
324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x7 = input[6];
325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    output[0] = output[1] = output[2] = output[3] = output[4]
328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang              = output[5] = output[6] = output[7] = 0;
329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    return;
330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = dct_const_round_shift(s0 + s4);
343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = dct_const_round_shift(s1 + s5);
344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = dct_const_round_shift(s2 + s6);
345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = dct_const_round_shift(s3 + s7);
346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x4 = dct_const_round_shift(s0 - s4);
347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x5 = dct_const_round_shift(s1 - s5);
348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = dct_const_round_shift(s2 - s6);
349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = dct_const_round_shift(s3 - s7);
350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2
352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0;
353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1;
354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2;
355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x3;
356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s2;
362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 + s3;
363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s0 - s2;
364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s1 - s3;
365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x4 = dct_const_round_shift(s4 + s6);
366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x5 = dct_const_round_shift(s5 + s7);
367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = dct_const_round_shift(s4 - s6);
368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = dct_const_round_shift(s5 - s7);
369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 3
371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = cospi_16_64 * (x2 + x3);
372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = cospi_16_64 * (x2 - x3);
373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = cospi_16_64 * (x6 + x7);
374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = cospi_16_64 * (x6 - x7);
375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = dct_const_round_shift(s2);
377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = dct_const_round_shift(s3);
378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = dct_const_round_shift(s6);
379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = dct_const_round_shift(s7);
380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] =  x0;
382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = -x4;
383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] =  x6;
384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = -x2;
385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] =  x3;
386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = -x7;
387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] =  x5;
388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = -x1;
389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangstatic const transform_2d IHT_8[] = {
392b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { idct8,  idct8  },  // DCT_DCT  = 0
393b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { iadst8, idct8  },  // ADST_DCT = 1
394b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { idct8,  iadst8 },  // DCT_ADST = 2
395b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { iadst8, iadst8 }   // ADST_ADST = 3
396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang};
397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         int tx_type) {
400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i, j;
401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t out[8 * 8];
402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t *outptr = out;
403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t temp_in[8], temp_out[8];
404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const transform_2d ht = IHT_8[tx_type];
405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // inverse transform row vectors
407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 8; ++i) {
408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ht.rows(input, outptr);
409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    input += 8;
410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    outptr += 8;
411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // inverse transform column vectors
414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 8; ++i) {
415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (j = 0; j < 8; ++j)
416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      temp_in[j] = out[j * 8 + i];
417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ht.cols(temp_in, temp_out);
418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (j = 0; j < 8; ++j)
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  + dest[j * stride + i]);
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Reduced-work inverse 8x8 DCT with reconstruction.
//
// Only the first 4 rows of `input` are run through idct8(); the remaining
// rows of the intermediate buffer stay at their zero initial value. The
// caller is therefore expected to use this variant only when rows 4..7 of the
// coefficient block are all zero. All 8 columns are then transformed and the
// results are added to `dest` (row pitch `stride`) after
// ROUND_POWER_OF_TWO(., 5), with clip_pixel() applied to each sum.
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t row_pass[8 * 8] = { 0 };
  int16_t col_in[8], col_out[8];
  int r, c;

  // Row pass: rows 4..7 are skipped and remain zero in row_pass.
  for (r = 0; r < 4; ++r)
    idct8(input + r * 8, row_pass + r * 8);

  // Column pass over the full 8x8 intermediate block.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = row_pass[r * 8 + c];

    idct8(col_in, col_out);

    for (r = 0; r < 8; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5) + *px);
    }
  }
}
448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct16(const int16_t *input, int16_t *output) {
450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t step1[16], step2[16];
451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int temp1, temp2;
452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = input[0/2];
455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = input[16/2];
456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = input[8/2];
457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = input[24/2];
458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = input[4/2];
459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = input[20/2];
460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = input[12/2];
461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = input[28/2];
462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[8] = input[2/2];
463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[9] = input[18/2];
464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[10] = input[10/2];
465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[11] = input[26/2];
466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[12] = input[6/2];
467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[13] = input[22/2];
468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[14] = input[14/2];
469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[15] = input[30/2];
470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2
472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[0] = step1[0];
473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[1] = step1[1];
474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[2] = step1[2];
475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[3] = step1[3];
476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[4] = step1[4];
477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[5] = step1[5];
478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[6] = step1[6];
479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[7] = step1[7];
480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[8] = dct_const_round_shift(temp1);
484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[15] = dct_const_round_shift(temp2);
485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[9] = dct_const_round_shift(temp1);
489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[14] = dct_const_round_shift(temp2);
490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[10] = dct_const_round_shift(temp1);
494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[13] = dct_const_round_shift(temp2);
495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[11] = dct_const_round_shift(temp1);
499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[12] = dct_const_round_shift(temp2);
500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 3
502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = step2[0];
503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = step2[1];
504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = step2[2];
505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = step2[3];
506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = dct_const_round_shift(temp1);
510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = dct_const_round_shift(temp2);
511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = dct_const_round_shift(temp1);
514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = dct_const_round_shift(temp2);
515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[8] = step2[8] + step2[9];
517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[9] = step2[8] - step2[9];
518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[10] = -step2[10] + step2[11];
519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[11] = step2[10] + step2[11];
520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[12] = step2[12] + step2[13];
521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[13] = step2[12] - step2[13];
522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[14] = -step2[14] + step2[15];
523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[15] = step2[14] + step2[15];
524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
52591037db265ecdd914a26e056cf69207b4f50924ehkuang  // stage 4
526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step1[0] + step1[1]) * cospi_16_64;
527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[0] - step1[1]) * cospi_16_64;
528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[0] = dct_const_round_shift(temp1);
529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[1] = dct_const_round_shift(temp2);
530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[2] = dct_const_round_shift(temp1);
533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[3] = dct_const_round_shift(temp2);
534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[4] = step1[4] + step1[5];
535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[5] = step1[4] - step1[5];
536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[6] = -step1[6] + step1[7];
537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[7] = step1[6] + step1[7];
538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[8] = step1[8];
540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[15] = step1[15];
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[9] = dct_const_round_shift(temp1);
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[14] = dct_const_round_shift(temp2);
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[10] = dct_const_round_shift(temp1);
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[13] = dct_const_round_shift(temp2);
549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[11] = step1[11];
550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[12] = step1[12];
551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 5
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = step2[0] + step2[3];
554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = step2[1] + step2[2];
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = step2[1] - step2[2];
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = step2[0] - step2[3];
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = step2[4];
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step2[6] - step2[5]) * cospi_16_64;
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step2[5] + step2[6]) * cospi_16_64;
560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = dct_const_round_shift(temp1);
561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = dct_const_round_shift(temp2);
562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = step2[7];
563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[8] = step2[8] + step2[11];
565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[9] = step2[9] + step2[10];
566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[10] = step2[9] - step2[10];
567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[11] = step2[8] - step2[11];
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[12] = -step2[12] + step2[15];
569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[13] = -step2[13] + step2[14];
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[14] = step2[13] + step2[14];
571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[15] = step2[12] + step2[15];
572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 6
574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[0] = step1[0] + step1[7];
575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[1] = step1[1] + step1[6];
576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[2] = step1[2] + step1[5];
577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[3] = step1[3] + step1[4];
578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[4] = step1[3] - step1[4];
579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[5] = step1[2] - step1[5];
580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[6] = step1[1] - step1[6];
581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[7] = step1[0] - step1[7];
582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[8] = step1[8];
583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[9] = step1[9];
584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[10] + step1[13]) * cospi_16_64;
586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[10] = dct_const_round_shift(temp1);
587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[13] = dct_const_round_shift(temp2);
588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[11] + step1[12]) * cospi_16_64;
590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[11] = dct_const_round_shift(temp1);
591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[12] = dct_const_round_shift(temp2);
592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[14] = step1[14];
593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[15] = step1[15];
594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 7
596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = step2[0] + step2[15];
597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = step2[1] + step2[14];
598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = step2[2] + step2[13];
599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = step2[3] + step2[12];
600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] = step2[4] + step2[11];
601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = step2[5] + step2[10];
602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] = step2[6] + step2[9];
603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = step2[7] + step2[8];
604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8] = step2[7] - step2[8];
605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9] = step2[6] - step2[9];
606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] = step2[5] - step2[10];
607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] = step2[4] - step2[11];
608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] = step2[3] - step2[12];
609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = step2[2] - step2[13];
610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] = step2[1] - step2[14];
611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = step2[0] - step2[15];
612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Full inverse 16x16 DCT with reconstruction.
//
// Runs idct16() over each of the 16 rows of `input` (256 coefficients,
// row-major), then over each column of that intermediate result. Each output
// sample goes through ROUND_POWER_OF_TWO(., 6), is added to the pixel already
// in `dest`, and the sum is clipped by clip_pixel(). Pixel rows in `dest` are
// `stride` elements apart.
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t row_pass[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row pass.
  for (r = 0; r < 16; ++r)
    idct16(input + r * 16, row_pass + r * 16);

  // Column pass, followed by the add into the destination block.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = row_pass[r * 16 + c];

    idct16(col_in, col_out);

    for (r = 0; r < 16; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + *px);
    }
  }
}
637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
638b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void iadst16(const int16_t *input, int16_t *output) {
639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x0 = input[15];
642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x1 = input[0];
643ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x2 = input[13];
644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x3 = input[2];
645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x4 = input[11];
646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x5 = input[4];
647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x6 = input[9];
648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x7 = input[6];
649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x8 = input[7];
650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x9 = input[8];
651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x10 = input[5];
652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x11 = input[10];
653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x12 = input[3];
654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x13 = input[12];
655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x14 = input[1];
656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x15 = input[14];
657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    output[0] = output[1] = output[2] = output[3] = output[4]
661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang              = output[5] = output[6] = output[7] = output[8]
662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang              = output[9] = output[10] = output[11] = output[12]
663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang              = output[13] = output[14] = output[15] = 0;
664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    return;
665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = dct_const_round_shift(s0 + s8);
686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = dct_const_round_shift(s1 + s9);
687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = dct_const_round_shift(s2 + s10);
688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = dct_const_round_shift(s3 + s11);
689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x4 = dct_const_round_shift(s4 + s12);
690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x5 = dct_const_round_shift(s5 + s13);
691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = dct_const_round_shift(s6 + s14);
692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = dct_const_round_shift(s7 + s15);
693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x8  = dct_const_round_shift(s0 - s8);
694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x9  = dct_const_round_shift(s1 - s9);
695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x10 = dct_const_round_shift(s2 - s10);
696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x11 = dct_const_round_shift(s3 - s11);
697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x12 = dct_const_round_shift(s4 - s12);
698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x13 = dct_const_round_shift(s5 - s13);
699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x14 = dct_const_round_shift(s6 - s14);
700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x15 = dct_const_round_shift(s7 - s15);
701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2
703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0;
704ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1;
705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2;
706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x3;
707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = x4;
708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = x5;
709ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = x6;
710ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = x7;
711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
714ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
716ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
717ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
718ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
719ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
720ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s4;
721ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 + s5;
722ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s2 + s6;
723ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s3 + s7;
724ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x4 = s0 - s4;
725ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x5 = s1 - s5;
726ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = s2 - s6;
727ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = s3 - s7;
728ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x8 = dct_const_round_shift(s8 + s12);
729ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x9 = dct_const_round_shift(s9 + s13);
730ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x10 = dct_const_round_shift(s10 + s14);
731ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x11 = dct_const_round_shift(s11 + s15);
732ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x12 = dct_const_round_shift(s8 - s12);
733ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x13 = dct_const_round_shift(s9 - s13);
734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x14 = dct_const_round_shift(s10 - s14);
735ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x15 = dct_const_round_shift(s11 - s15);
736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 3
738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0;
739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1;
740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2;
741ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x3;
742ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
743ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
744ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s8 = x8;
747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s9 = x9;
748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 = x10;
749ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 = x11;
750ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s2;
756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 + s3;
757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s0 - s2;
758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s1 - s3;
759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x4 = dct_const_round_shift(s4 + s6);
760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x5 = dct_const_round_shift(s5 + s7);
761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = dct_const_round_shift(s4 - s6);
762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = dct_const_round_shift(s5 - s7);
763ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x8 = s8 + s10;
764ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x9 = s9 + s11;
765ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x10 = s8 - s10;
766ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x11 = s9 - s11;
767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x12 = dct_const_round_shift(s12 + s14);
768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x13 = dct_const_round_shift(s13 + s15);
769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x14 = dct_const_round_shift(s12 - s14);
770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x15 = dct_const_round_shift(s13 - s15);
771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
772ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 4
773ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = (- cospi_16_64) * (x2 + x3);
774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = cospi_16_64 * (x2 - x3);
775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = cospi_16_64 * (x6 + x7);
776ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = cospi_16_64 * (- x6 + x7);
777ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 = cospi_16_64 * (x10 + x11);
778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 = cospi_16_64 * (- x10 + x11);
779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = (- cospi_16_64) * (x14 + x15);
780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 = cospi_16_64 * (x14 - x15);
781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = dct_const_round_shift(s2);
783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = dct_const_round_shift(s3);
784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = dct_const_round_shift(s6);
785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = dct_const_round_shift(s7);
786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x10 = dct_const_round_shift(s10);
787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x11 = dct_const_round_shift(s11);
788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x14 = dct_const_round_shift(s14);
789ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x15 = dct_const_round_shift(s15);
790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] =  x0;
792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = -x8;
793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] =  x12;
794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = -x4;
795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] =  x6;
796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] =  x14;
797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] =  x10;
798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] =  x2;
799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8] =  x3;
800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9] =  x11;
801ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] =  x15;
802ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] =  x7;
803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] =  x5;
804ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = -x13;
805ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] =  x9;
806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = -x1;
807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangstatic const transform_2d IHT_16[] = {
810b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { idct16,  idct16  },  // DCT_DCT  = 0
811b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { iadst16, idct16  },  // ADST_DCT = 1
812b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { idct16,  iadst16 },  // DCT_ADST = 2
813b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  { iadst16, iadst16 }   // ADST_ADST = 3
814ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang};
815ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            int tx_type) {
818ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i, j;
819ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t out[16 * 16];
820ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t *outptr = out;
821ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t temp_in[16], temp_out[16];
822ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  const transform_2d ht = IHT_16[tx_type];
823ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
824ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows
825ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 16; ++i) {
826ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ht.rows(input, outptr);
827ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    input += 16;
828ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    outptr += 16;
829ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
830ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
831ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Columns
832ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 16; ++i) {
833ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (j = 0; j < 16; ++j)
834ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      temp_in[j] = out[j * 16 + i];
835ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ht.cols(temp_in, temp_out);
836ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (j = 0; j < 16; ++j)
8375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
838b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian                                        + dest[j * stride + i]);
839b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian  }
840ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
841ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Reduced-work inverse 16x16 DCT, added onto the destination block.
//
// Intended for blocks whose non-zero coefficients all lie in the upper-left
// 4x4 area: only the first 4 input rows are transformed, and the remaining
// rows of the intermediate buffer stay at their zero initial value.
//
//   input  - coefficient block, 16 values per row; only rows 0..3 are read.
//   dest   - destination pixels; element (row, col) is dest[row * stride + col].
//   stride - distance in elements between vertically adjacent dest pixels.
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // Zero-filled so the 12 rows skipped by the row pass feed zeros to the
  // column pass.
  int16_t row_pass[16 * 16] = { 0 };
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row pass over the first 4 rows only.
  for (r = 0; r < 4; ++r)
    idct16(input + 16 * r, row_pass + 16 * r);

  // Column pass over all 16 columns, then round by 6 bits, add to the
  // existing pixel and clip.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = row_pass[16 * r + c];

    idct16(col_in, col_out);

    for (r = 0; r < 16; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + *px);
    }
  }
}
866ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
8675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
868f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int i, j;
869f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  int a1;
870ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
871ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  out = dct_const_round_shift(out * cospi_16_64);
872f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  a1 = ROUND_POWER_OF_TWO(out, 6);
873f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  for (j = 0; j < 16; ++j) {
874f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    for (i = 0; i < 16; ++i)
875f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang      dest[i] = clip_pixel(dest[i] + a1);
8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest += stride;
877f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang  }
878ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
879ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
880b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic void idct32(const int16_t *input, int16_t *output) {
881ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t step1[32], step2[32];
882ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int temp1, temp2;
883ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
884ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
885ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = input[0];
886ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = input[16];
887ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = input[8];
888ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = input[24];
889ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = input[4];
890ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = input[20];
891ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = input[12];
892ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = input[28];
893ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[8] = input[2];
894ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[9] = input[18];
895ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[10] = input[10];
896ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[11] = input[26];
897ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[12] = input[6];
898ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[13] = input[22];
899ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[14] = input[14];
900ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[15] = input[30];
901ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
902ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
903ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
904ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[16] = dct_const_round_shift(temp1);
905ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[31] = dct_const_round_shift(temp2);
906ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
907ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
908ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
909ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[17] = dct_const_round_shift(temp1);
910ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[30] = dct_const_round_shift(temp2);
911ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
912ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
913ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
914ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[18] = dct_const_round_shift(temp1);
915ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[29] = dct_const_round_shift(temp2);
916ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
917ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
918ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
919ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[19] = dct_const_round_shift(temp1);
920ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[28] = dct_const_round_shift(temp2);
921ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
922ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
923ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
924ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[20] = dct_const_round_shift(temp1);
925ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[27] = dct_const_round_shift(temp2);
926ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
927ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
928ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
929ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[21] = dct_const_round_shift(temp1);
930ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[26] = dct_const_round_shift(temp2);
931ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
932ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
933ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
934ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[22] = dct_const_round_shift(temp1);
935ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[25] = dct_const_round_shift(temp2);
936ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
937ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
938ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
939ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[23] = dct_const_round_shift(temp1);
940ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[24] = dct_const_round_shift(temp2);
941ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
942ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2
943ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[0] = step1[0];
944ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[1] = step1[1];
945ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[2] = step1[2];
946ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[3] = step1[3];
947ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[4] = step1[4];
948ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[5] = step1[5];
949ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[6] = step1[6];
950ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[7] = step1[7];
951ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
952ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
953ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
954ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[8] = dct_const_round_shift(temp1);
955ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[15] = dct_const_round_shift(temp2);
956ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
957ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
958ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
959ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[9] = dct_const_round_shift(temp1);
960ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[14] = dct_const_round_shift(temp2);
961ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
963ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
964ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[10] = dct_const_round_shift(temp1);
965ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[13] = dct_const_round_shift(temp2);
966ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
967ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[11] = dct_const_round_shift(temp1);
970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[12] = dct_const_round_shift(temp2);
971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[16] = step1[16] + step1[17];
973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[17] = step1[16] - step1[17];
974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[18] = -step1[18] + step1[19];
975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[19] = step1[18] + step1[19];
976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[20] = step1[20] + step1[21];
977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[21] = step1[20] - step1[21];
978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[22] = -step1[22] + step1[23];
979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[23] = step1[22] + step1[23];
980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[24] = step1[24] + step1[25];
981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[25] = step1[24] - step1[25];
982ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[26] = -step1[26] + step1[27];
983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[27] = step1[26] + step1[27];
984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[28] = step1[28] + step1[29];
985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[29] = step1[28] - step1[29];
986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[30] = -step1[30] + step1[31];
987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[31] = step1[30] + step1[31];
988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 3
990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = step2[0];
991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = step2[1];
992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = step2[2];
993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = step2[3];
994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = dct_const_round_shift(temp1);
998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = dct_const_round_shift(temp2);
999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = dct_const_round_shift(temp1);
1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = dct_const_round_shift(temp2);
1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[8] = step2[8] + step2[9];
1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[9] = step2[8] - step2[9];
1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[10] = -step2[10] + step2[11];
1007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[11] = step2[10] + step2[11];
1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[12] = step2[12] + step2[13];
1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[13] = step2[12] - step2[13];
1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[14] = -step2[14] + step2[15];
1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[15] = step2[14] + step2[15];
1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[16] = step2[16];
1014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[31] = step2[31];
1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[17] = dct_const_round_shift(temp1);
1018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[30] = dct_const_round_shift(temp2);
1019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1021ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[18] = dct_const_round_shift(temp1);
1022ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[29] = dct_const_round_shift(temp2);
1023ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[19] = step2[19];
1024ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[20] = step2[20];
1025ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1026ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1027ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[21] = dct_const_round_shift(temp1);
1028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[26] = dct_const_round_shift(temp2);
1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[22] = dct_const_round_shift(temp1);
1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[25] = dct_const_round_shift(temp2);
1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[23] = step2[23];
1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[24] = step2[24];
1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[27] = step2[27];
1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[28] = step2[28];
1037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 4
1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step1[0] + step1[1]) * cospi_16_64;
1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[0] - step1[1]) * cospi_16_64;
1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[0] = dct_const_round_shift(temp1);
1042ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[1] = dct_const_round_shift(temp2);
1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1044ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[2] = dct_const_round_shift(temp1);
1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[3] = dct_const_round_shift(temp2);
1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[4] = step1[4] + step1[5];
1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[5] = step1[4] - step1[5];
1049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[6] = -step1[6] + step1[7];
1050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[7] = step1[6] + step1[7];
1051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[8] = step1[8];
1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[15] = step1[15];
1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[9] = dct_const_round_shift(temp1);
1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[14] = dct_const_round_shift(temp2);
1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[10] = dct_const_round_shift(temp1);
1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[13] = dct_const_round_shift(temp2);
1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[11] = step1[11];
1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[12] = step1[12];
1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[16] = step1[16] + step1[19];
1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[17] = step1[17] + step1[18];
1067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[18] = step1[17] - step1[18];
1068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[19] = step1[16] - step1[19];
1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[20] = -step1[20] + step1[23];
1070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[21] = -step1[21] + step1[22];
1071ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[22] = step1[21] + step1[22];
1072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[23] = step1[20] + step1[23];
1073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[24] = step1[24] + step1[27];
1075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[25] = step1[25] + step1[26];
1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[26] = step1[25] - step1[26];
1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[27] = step1[24] - step1[27];
1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[28] = -step1[28] + step1[31];
1079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[29] = -step1[29] + step1[30];
1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[30] = step1[29] + step1[30];
1081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[31] = step1[28] + step1[31];
1082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 5
1084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = step2[0] + step2[3];
1085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = step2[1] + step2[2];
1086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = step2[1] - step2[2];
1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = step2[0] - step2[3];
1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = step2[4];
1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step2[6] - step2[5]) * cospi_16_64;
1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step2[5] + step2[6]) * cospi_16_64;
1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = dct_const_round_shift(temp1);
1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = dct_const_round_shift(temp2);
1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = step2[7];
1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[8] = step2[8] + step2[11];
1096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[9] = step2[9] + step2[10];
1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[10] = step2[9] - step2[10];
1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[11] = step2[8] - step2[11];
1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[12] = -step2[12] + step2[15];
1100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[13] = -step2[13] + step2[14];
1101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[14] = step2[13] + step2[14];
1102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[15] = step2[12] + step2[15];
1103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[16] = step2[16];
1105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[17] = step2[17];
1106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[18] = dct_const_round_shift(temp1);
1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[29] = dct_const_round_shift(temp2);
1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[19] = dct_const_round_shift(temp1);
1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[28] = dct_const_round_shift(temp2);
1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[20] = dct_const_round_shift(temp1);
1117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[27] = dct_const_round_shift(temp2);
1118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[21] = dct_const_round_shift(temp1);
1121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[26] = dct_const_round_shift(temp2);
1122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[22] = step2[22];
1123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[23] = step2[23];
1124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[24] = step2[24];
1125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[25] = step2[25];
1126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[30] = step2[30];
1127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[31] = step2[31];
1128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 6
1130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[0] = step1[0] + step1[7];
1131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[1] = step1[1] + step1[6];
1132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[2] = step1[2] + step1[5];
1133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[3] = step1[3] + step1[4];
1134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[4] = step1[3] - step1[4];
1135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[5] = step1[2] - step1[5];
1136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[6] = step1[1] - step1[6];
1137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[7] = step1[0] - step1[7];
1138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[8] = step1[8];
1139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[9] = step1[9];
1140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[10] + step1[13]) * cospi_16_64;
1142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[10] = dct_const_round_shift(temp1);
1143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[13] = dct_const_round_shift(temp2);
1144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[11] + step1[12]) * cospi_16_64;
1146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[11] = dct_const_round_shift(temp1);
1147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[12] = dct_const_round_shift(temp2);
1148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[14] = step1[14];
1149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[15] = step1[15];
1150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[16] = step1[16] + step1[23];
1152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[17] = step1[17] + step1[22];
1153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[18] = step1[18] + step1[21];
1154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[19] = step1[19] + step1[20];
1155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[20] = step1[19] - step1[20];
1156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[21] = step1[18] - step1[21];
1157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[22] = step1[17] - step1[22];
1158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[23] = step1[16] - step1[23];
1159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[24] = -step1[24] + step1[31];
1161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[25] = -step1[25] + step1[30];
1162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[26] = -step1[26] + step1[29];
1163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[27] = -step1[27] + step1[28];
1164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[28] = step1[27] + step1[28];
1165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[29] = step1[26] + step1[29];
1166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[30] = step1[25] + step1[30];
1167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step2[31] = step1[24] + step1[31];
1168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 7
1170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = step2[0] + step2[15];
1171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = step2[1] + step2[14];
1172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = step2[2] + step2[13];
1173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = step2[3] + step2[12];
1174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = step2[4] + step2[11];
1175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = step2[5] + step2[10];
1176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = step2[6] + step2[9];
1177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = step2[7] + step2[8];
1178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[8] = step2[7] - step2[8];
1179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[9] = step2[6] - step2[9];
1180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[10] = step2[5] - step2[10];
1181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[11] = step2[4] - step2[11];
1182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[12] = step2[3] - step2[12];
1183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[13] = step2[2] - step2[13];
1184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[14] = step2[1] - step2[14];
1185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[15] = step2[0] - step2[15];
1186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[16] = step2[16];
1188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[17] = step2[17];
1189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[18] = step2[18];
1190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[19] = step2[19];
1191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step2[20] + step2[27]) * cospi_16_64;
1193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[20] = dct_const_round_shift(temp1);
1194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[27] = dct_const_round_shift(temp2);
1195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step2[21] + step2[26]) * cospi_16_64;
1197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[21] = dct_const_round_shift(temp1);
1198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[26] = dct_const_round_shift(temp2);
1199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step2[22] + step2[25]) * cospi_16_64;
1201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[22] = dct_const_round_shift(temp1);
1202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[25] = dct_const_round_shift(temp2);
1203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step2[23] + step2[24]) * cospi_16_64;
1205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[23] = dct_const_round_shift(temp1);
1206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[24] = dct_const_round_shift(temp2);
1207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[28] = step2[28];
1208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[29] = step2[29];
1209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[30] = step2[30];
1210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[31] = step2[31];
1211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // final stage
1213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = step1[0] + step1[31];
1214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = step1[1] + step1[30];
1215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = step1[2] + step1[29];
1216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = step1[3] + step1[28];
1217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] = step1[4] + step1[27];
1218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = step1[5] + step1[26];
1219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] = step1[6] + step1[25];
1220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = step1[7] + step1[24];
1221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8] = step1[8] + step1[23];
1222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9] = step1[9] + step1[22];
1223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] = step1[10] + step1[21];
1224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] = step1[11] + step1[20];
1225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] = step1[12] + step1[19];
1226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = step1[13] + step1[18];
1227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] = step1[14] + step1[17];
1228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = step1[15] + step1[16];
1229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[16] = step1[15] - step1[16];
1230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[17] = step1[14] - step1[17];
1231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[18] = step1[13] - step1[18];
1232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[19] = step1[12] - step1[19];
1233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[20] = step1[11] - step1[20];
1234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[21] = step1[10] - step1[21];
1235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[22] = step1[9] - step1[22];
1236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[23] = step1[8] - step1[23];
1237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[24] = step1[7] - step1[24];
1238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[25] = step1[6] - step1[25];
1239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[26] = step1[5] - step1[26];
1240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[27] = step1[4] - step1[27];
1241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[28] = step1[3] - step1[28];
1242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[29] = step1[2] - step1[29];
1243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[30] = step1[1] - step1[30];
1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[31] = step1[0] - step1[31];
1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
/* 32x32 inverse DCT with reconstruction.
 *
 * input  - 32x32 coefficients, read row by row (32 values per row).
 * dest   - pixel buffer that the rounded residual is added to in place.
 * stride - distance, in bytes, between vertically adjacent pixels in dest.
 *
 * Runs the 1-D 32-point transform over every row, then over every column of
 * the intermediate result, and adds the rounded output (shift by 6) to dest,
 * clamping through clip_pixel().
 */
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32];
  int16_t col_in[32], col_out[32];
  int r, c;

  // Row pass. A row whose 32 coefficients are all zero is written out as
  // zeros directly instead of being passed through idct32().
  for (r = 0; r < 32; ++r) {
    const int16_t *const row_in = input + r * 32;
    int16_t *const row_out = out + r * 32;
    int any_nonzero = 0;

    // OR all coefficients together; the result is non-zero iff any one is.
    for (c = 0; c < 32; ++c)
      any_nonzero |= row_in[c];

    if (any_nonzero)
      idct32(row_in, row_out);
    else
      vpx_memset(row_out, 0, sizeof(int16_t) * 32);
  }

  // Column pass: gather one column, transform it, then round and add it to
  // the destination pixels.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      col_in[r] = out[r * 32 + c];

    idct32(col_in, col_out);

    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + *px);
    }
  }
}
12835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/* 32x32 inverse DCT with reconstruction, reduced-coefficient variant.
 *
 * input  - 32x32 coefficients, read row by row; only the first 8 rows are
 *          read by this function.
 * dest   - pixel buffer that the rounded residual is added to in place.
 * stride - distance, in bytes, between vertically adjacent pixels in dest.
 *
 * Intended for blocks where only the upper-left 8x8 has non-zero
 * coefficients: the row pass transforms just the first 8 rows and the other
 * 24 intermediate rows stay at their zero initial value.
 */
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32] = {0};
  int16_t col_in[32], col_out[32];
  int r, c;

  // Row pass over the first 8 rows only.
  for (r = 0; r < 8; ++r)
    idct32(input + r * 32, out + r * 32);

  // Column pass over all 32 columns, followed by round, add and clamp.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      col_in[r] = out[r * 32 + c];

    idct32(col_in, col_out);

    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + *px);
    }
  }
}
1308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
13095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
13105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int i, j;
13115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int a1;
13125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
1314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  out = dct_const_round_shift(out * cospi_16_64);
13155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  a1 = ROUND_POWER_OF_TWO(out, 6);
13165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
13175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (j = 0; j < 32; ++j) {
13185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (i = 0; i < 32; ++i)
13195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dest[i] = clip_pixel(dest[i] + a1);
13205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dest += stride;
13215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
13225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
13235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
13245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang// idct
/* Chooses the 4x4 inverse DCT variant from the end-of-block position:
 * the 1-coefficient routine when eob <= 1, the 16-coefficient one otherwise. */
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob <= 1) {
    vp9_idct4x4_1_add(input, dest, stride);
    return;
  }
  vp9_idct4x4_16_add(input, dest, stride);
}
13315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
13325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/* Chooses the 4x4 inverse Walsh-Hadamard variant from the end-of-block
 * position: the 1-coefficient routine when eob <= 1, the 16-coefficient one
 * otherwise. */
void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob <= 1) {
    vp9_iwht4x4_1_add(input, dest, stride);
    return;
  }
  vp9_iwht4x4_16_add(input, dest, stride);
}
13395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/* Chooses the 8x8 inverse DCT variant from the end-of-block position. */
void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // If dc is 1, input[0] is already the reconstructed value and needs no
  // dequantization. In that case dc is also counted in eobs, so eobs >= 1.

  // Fewer non-zero dct coefficients allow a cheaper transform; eob selects
  // which one to run.
  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
  // Combine that with code here.
  if (eob > 10) {
    vp9_idct8x8_64_add(input, dest, stride);
  } else if (eob == 1) {
    // DC only DCT coefficient
    vp9_idct8x8_1_add(input, dest, stride);
  } else {
    vp9_idct8x8_10_add(input, dest, stride);
  }
}
13565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/* Chooses the 16x16 inverse DCT variant from the end-of-block position. */
void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  /* Fewer non-zero dct coefficients allow a cheaper transform; eob
   * separates the cases. */
  if (eob > 10) {
    vp9_idct16x16_256_add(input, dest, stride);
  } else if (eob == 1) {
    /* DC only DCT coefficient. */
    vp9_idct16x16_1_add(input, dest, stride);
  } else {
    vp9_idct16x16_10_add(input, dest, stride);
  }
}
13695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/* Chooses the 32x32 inverse DCT variant from the end-of-block position. */
void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  if (eob > 34) {
    vp9_idct32x32_1024_add(input, dest, stride);
  } else if (eob == 1) {
    vp9_idct32x32_1_add(input, dest, stride);
  } else {
    // non-zero coeff only in upper-left 8x8
    vp9_idct32x32_34_add(input, dest, stride);
  }
}
13805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
13815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang// iht
13825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
13835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    int stride, int eob) {
13845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (tx_type == DCT_DCT)
13855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_idct4x4_add(input, dest, stride, eob);
13865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  else
13875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_iht4x4_16_add(input, dest, stride, tx_type);
13885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
13895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
13905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
13915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    int stride, int eob) {
13925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (tx_type == DCT_DCT) {
13935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_idct8x8_add(input, dest, stride, eob);
13945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
1395b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    vp9_iht8x8_64_add(input, dest, stride, tx_type);
13965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
13975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
13985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
13995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
14005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                      int stride, int eob) {
14015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (tx_type == DCT_DCT) {
14025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_idct16x16_add(input, dest, stride, eob);
14035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
1404b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian    vp9_iht16x16_256_add(input, dest, stride, tx_type);
14055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1407