/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <assert.h>
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <math.h>
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vpx_config.h"
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vp9_rtcd.h"
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_systemdependent.h"
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_blockd.h"
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_common.h"
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_idct.h"
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan
21233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
23233d2500723e5594f3e7c70896ffeeef32b9c950ywan   0.5 shifts per pixel. */
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i;
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t output[16];
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int a1, b1, c1, d1, e1;
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const int16_t *ip = input;
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t *op = output;
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 4; i++) {
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan    a1 = ip[0] >> UNIT_QUANT_SHIFT;
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan    c1 = ip[1] >> UNIT_QUANT_SHIFT;
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan    d1 = ip[2] >> UNIT_QUANT_SHIFT;
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan    b1 = ip[3] >> UNIT_QUANT_SHIFT;
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan    a1 += c1;
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan    d1 -= b1;
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan    e1 = (a1 - d1) >> 1;
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan    b1 = e1 - b1;
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan    c1 = e1 - c1;
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan    a1 -= b1;
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan    d1 += c1;
42233d2500723e5594f3e7c70896ffeeef32b9c950ywan    op[0] = a1;
43233d2500723e5594f3e7c70896ffeeef32b9c950ywan    op[1] = b1;
44233d2500723e5594f3e7c70896ffeeef32b9c950ywan    op[2] = c1;
45233d2500723e5594f3e7c70896ffeeef32b9c950ywan    op[3] = d1;
46233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ip += 4;
47233d2500723e5594f3e7c70896ffeeef32b9c950ywan    op += 4;
48233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
49233d2500723e5594f3e7c70896ffeeef32b9c950ywan
50233d2500723e5594f3e7c70896ffeeef32b9c950ywan  ip = output;
51233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 4; i++) {
52233d2500723e5594f3e7c70896ffeeef32b9c950ywan    a1 = ip[4 * 0];
53233d2500723e5594f3e7c70896ffeeef32b9c950ywan    c1 = ip[4 * 1];
54233d2500723e5594f3e7c70896ffeeef32b9c950ywan    d1 = ip[4 * 2];
55233d2500723e5594f3e7c70896ffeeef32b9c950ywan    b1 = ip[4 * 3];
56233d2500723e5594f3e7c70896ffeeef32b9c950ywan    a1 += c1;
57233d2500723e5594f3e7c70896ffeeef32b9c950ywan    d1 -= b1;
58233d2500723e5594f3e7c70896ffeeef32b9c950ywan    e1 = (a1 - d1) >> 1;
59233d2500723e5594f3e7c70896ffeeef32b9c950ywan    b1 = e1 - b1;
60233d2500723e5594f3e7c70896ffeeef32b9c950ywan    c1 = e1 - c1;
61233d2500723e5594f3e7c70896ffeeef32b9c950ywan    a1 -= b1;
62233d2500723e5594f3e7c70896ffeeef32b9c950ywan    d1 += c1;
63233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
64233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
65233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
66233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
67233d2500723e5594f3e7c70896ffeeef32b9c950ywan
68233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ip++;
69233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest++;
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan
73233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i;
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int a1, e1;
76233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t tmp[4];
77233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const int16_t *ip = in;
78233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t *op = tmp;
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan
80233d2500723e5594f3e7c70896ffeeef32b9c950ywan  a1 = ip[0] >> UNIT_QUANT_SHIFT;
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan  e1 = a1 >> 1;
82233d2500723e5594f3e7c70896ffeeef32b9c950ywan  a1 -= e1;
83233d2500723e5594f3e7c70896ffeeef32b9c950ywan  op[0] = a1;
84233d2500723e5594f3e7c70896ffeeef32b9c950ywan  op[1] = op[2] = op[3] = e1;
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan
86233d2500723e5594f3e7c70896ffeeef32b9c950ywan  ip = tmp;
87233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 4; i++) {
88233d2500723e5594f3e7c70896ffeeef32b9c950ywan    e1 = ip[0] >> 1;
89233d2500723e5594f3e7c70896ffeeef32b9c950ywan    a1 = ip[0] - e1;
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
91233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
92233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
93233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ip++;
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest++;
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan
99233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void idct4(const int16_t *input, int16_t *output) {
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t step[4];
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int temp1, temp2;
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 1
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (input[0] + input[2]) * cospi_16_64;
104233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (input[0] - input[2]) * cospi_16_64;
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step[0] = dct_const_round_shift(temp1);
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step[1] = dct_const_round_shift(temp2);
107233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
108233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
109233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step[2] = dct_const_round_shift(temp1);
110233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step[3] = dct_const_round_shift(temp2);
111233d2500723e5594f3e7c70896ffeeef32b9c950ywan
112233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 2
113233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[0] = step[0] + step[3];
114233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[1] = step[1] + step[2];
115233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[2] = step[1] - step[2];
116233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[3] = step[0] - step[3];
117233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
118233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Full 4x4 inverse DCT: runs idct4() over every row of `input`, then over
// every column of that intermediate result, and adds the column output
// (after ROUND_POWER_OF_TWO(..., 4)) to the 4x4 block at `dest`, whose rows
// are `stride` bytes apart. Sums go through clip_pixel().
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t pass1[4 * 4];
  int16_t col_in[4], col_out[4];
  int r, c;

  // Rows
  for (r = 0; r < 4; ++r)
    idct4(input + 4 * r, pass1 + 4 * r);

  // Columns
  for (c = 0; c < 4; ++c) {
    // Gather column c into a contiguous buffer for the 1-D transform.
    for (r = 0; r < 4; ++r)
      col_in[r] = pass1[r * 4 + c];

    idct4(col_in, col_out);

    for (r = 0; r < 4; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 4) + *px);
    }
  }
}
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan
143233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i;
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int a1;
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan  out = dct_const_round_shift(out * cospi_16_64);
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan  a1 = ROUND_POWER_OF_TWO(out, 4);
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 4; i++) {
151233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[0] = clip_pixel(dest[0] + a1);
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[1] = clip_pixel(dest[1] + a1);
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[2] = clip_pixel(dest[2] + a1);
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest[3] = clip_pixel(dest[3] + a1);
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest += dest_stride;
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan
159233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void idct8(const int16_t *input, int16_t *output) {
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t step1[8], step2[8];
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int temp1, temp2;
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 1
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = input[0];
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = input[4];
165233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = input[2];
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = input[6];
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
169233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = dct_const_round_shift(temp1);
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = dct_const_round_shift(temp2);
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = dct_const_round_shift(temp1);
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = dct_const_round_shift(temp2);
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 2 & stage 3 - even half
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan  idct4(step1, step1);
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 2 - odd half
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[4] = step1[4] + step1[5];
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[5] = step1[4] - step1[5];
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[6] = -step1[6] + step1[7];
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[7] = step1[6] + step1[7];
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan
185233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 3 -odd half
186233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = step2[4];
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (step2[6] - step2[5]) * cospi_16_64;
188233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step2[5] + step2[6]) * cospi_16_64;
189233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = dct_const_round_shift(temp1);
190233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = dct_const_round_shift(temp2);
191233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = step2[7];
192233d2500723e5594f3e7c70896ffeeef32b9c950ywan
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 4
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[0] = step1[0] + step1[7];
195233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[1] = step1[1] + step1[6];
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[2] = step1[2] + step1[5];
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[3] = step1[3] + step1[4];
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[4] = step1[3] - step1[4];
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[5] = step1[2] - step1[5];
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[6] = step1[1] - step1[6];
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[7] = step1[0] - step1[7];
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Full 8x8 inverse DCT: runs idct8() over all eight rows of `input`, then
// over every column of the intermediate result, and adds the column output
// (after ROUND_POWER_OF_TWO(..., 5)) to the 8x8 block at `dest`, whose rows
// are `stride` bytes apart. Sums go through clip_pixel().
void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t pass1[8 * 8];
  int16_t col_in[8], col_out[8];
  int r, c;

  // First transform rows
  for (r = 0; r < 8; ++r)
    idct8(input + 8 * r, pass1 + 8 * r);

  // Then transform columns
  for (c = 0; c < 8; ++c) {
    // Gather column c into a contiguous buffer for the 1-D transform.
    for (r = 0; r < 8; ++r)
      col_in[r] = pass1[r * 8 + c];

    idct8(col_in, col_out);

    for (r = 0; r < 8; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5) + *px);
    }
  }
}
227233d2500723e5594f3e7c70896ffeeef32b9c950ywan
228233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i, j;
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int a1;
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan  out = dct_const_round_shift(out * cospi_16_64);
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan  a1 = ROUND_POWER_OF_TWO(out, 5);
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (j = 0; j < 8; ++j) {
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (i = 0; i < 8; ++i)
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dest[i] = clip_pixel(dest[i] + a1);
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest += stride;
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan
241233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void iadst4(const int16_t *input, int16_t *output) {
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int s0, s1, s2, s3, s4, s5, s6, s7;
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x0 = input[0];
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x1 = input[1];
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x2 = input[2];
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x3 = input[3];
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (!(x0 | x1 | x2 | x3)) {
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    output[0] = output[1] = output[2] = output[3] = 0;
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return;
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s0 = sinpi_1_9 * x0;
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s1 = sinpi_2_9 * x0;
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = sinpi_3_9 * x1;
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = sinpi_4_9 * x2;
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s4 = sinpi_1_9 * x2;
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s5 = sinpi_2_9 * x3;
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = sinpi_4_9 * x3;
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 = x0 - x2 + x3;
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = s0 + s3 + s5;
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = s1 - s4 - s6;
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = sinpi_3_9 * s7;
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = s2;
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s0 = x0 + x3;
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s1 = x1 + x3;
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = x2;
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = x0 + x1 - x3;
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // 1-D transform scaling factor is sqrt(2).
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // + 1b (addition) = 29b.
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Hence the output bit depth is 15b.
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[0] = dct_const_round_shift(s0);
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[1] = dct_const_round_shift(s1);
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[2] = dct_const_round_shift(s2);
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[3] = dct_const_round_shift(s3);
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan
283233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         int tx_type) {
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const transform_2d IHT_4[] = {
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan    { idct4, idct4  },  // DCT_DCT  = 0
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan    { iadst4, idct4  },   // ADST_DCT = 1
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan    { idct4, iadst4 },    // DCT_ADST = 2
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan    { iadst4, iadst4 }      // ADST_ADST = 3
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan  };
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i, j;
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t out[4 * 4];
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t *outptr = out;
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t temp_in[4], temp_out[4];
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // inverse transform row vectors
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 4; ++i) {
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IHT_4[tx_type].rows(input, outptr);
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan    input  += 4;
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan    outptr += 4;
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // inverse transform column vectors
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 4; ++i) {
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (j = 0; j < 4; ++j)
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan      temp_in[j] = out[j * 4 + i];
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IHT_4[tx_type].cols(temp_in, temp_out);
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (j = 0; j < 4; ++j)
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  + dest[j * stride + i]);
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
314233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void iadst8(const int16_t *input, int16_t *output) {
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int s0, s1, s2, s3, s4, s5, s6, s7;
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x0 = input[7];
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x1 = input[0];
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x2 = input[5];
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x3 = input[2];
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x4 = input[3];
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x5 = input[4];
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x6 = input[1];
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x7 = input[6];
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    output[0] = output[1] = output[2] = output[3] = output[4]
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan              = output[5] = output[6] = output[7] = 0;
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return;
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 1
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = dct_const_round_shift(s0 + s4);
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = dct_const_round_shift(s1 + s5);
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = dct_const_round_shift(s2 + s6);
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = dct_const_round_shift(s3 + s7);
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = dct_const_round_shift(s0 - s4);
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = dct_const_round_shift(s1 - s5);
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = dct_const_round_shift(s2 - s6);
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = dct_const_round_shift(s3 - s7);
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 2
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s0 = x0;
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s1 = x1;
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = x2;
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = x3;
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = s0 + s2;
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = s1 + s3;
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = s0 - s2;
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = s1 - s3;
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = dct_const_round_shift(s4 + s6);
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = dct_const_round_shift(s5 + s7);
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = dct_const_round_shift(s4 - s6);
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = dct_const_round_shift(s5 - s7);
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 3
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = cospi_16_64 * (x2 + x3);
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = cospi_16_64 * (x2 - x3);
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = cospi_16_64 * (x6 + x7);
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 = cospi_16_64 * (x6 - x7);
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = dct_const_round_shift(s2);
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = dct_const_round_shift(s3);
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = dct_const_round_shift(s6);
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = dct_const_round_shift(s7);
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[0] =  x0;
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[1] = -x4;
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[2] =  x6;
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[3] = -x2;
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[4] =  x3;
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[5] = -x7;
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[6] =  x5;
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[7] = -x1;
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan
391233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic const transform_2d IHT_8[] = {
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { idct8,  idct8  },  // DCT_DCT  = 0
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { iadst8, idct8  },  // ADST_DCT = 1
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { idct8,  iadst8 },  // DCT_ADST = 2
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { iadst8, iadst8 }   // ADST_ADST = 3
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan};
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan
398233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         int tx_type) {
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i, j;
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t out[8 * 8];
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t *outptr = out;
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t temp_in[8], temp_out[8];
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const transform_2d ht = IHT_8[tx_type];
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // inverse transform row vectors
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 8; ++i) {
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ht.rows(input, outptr);
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan    input += 8;
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan    outptr += 8;
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // inverse transform column vectors
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 8; ++i) {
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (j = 0; j < 8; ++j)
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan      temp_in[j] = out[j * 8 + i];
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ht.cols(temp_in, temp_out);
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (j = 0; j < 8; ++j)
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  + dest[j * stride + i]);
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Reduced 8x8 inverse DCT: only the first four rows of `input` are run
// through idct8(); the other four rows of the intermediate buffer stay at
// their zero initial value. All eight columns are then transformed and added
// (after ROUND_POWER_OF_TWO(..., 5)) to the 8x8 block at `dest`, whose rows
// are `stride` bytes apart. Sums go through clip_pixel().
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t pass1[8 * 8] = { 0 };
  int16_t col_in[8], col_out[8];
  int r, c;

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (r = 0; r < 4; ++r)
    idct8(input + 8 * r, pass1 + 8 * r);

  // Then transform columns
  for (c = 0; c < 8; ++c) {
    // Gather column c into a contiguous buffer for the 1-D transform.
    for (r = 0; r < 8; ++r)
      col_in[r] = pass1[r * 8 + c];

    idct8(col_in, col_out);

    for (r = 0; r < 8; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5) + *px);
    }
  }
}
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan
449233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void idct16(const int16_t *input, int16_t *output) {
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t step1[16], step2[16];
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int temp1, temp2;
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 1
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = input[0/2];
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = input[16/2];
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = input[8/2];
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = input[24/2];
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = input[4/2];
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = input[20/2];
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = input[12/2];
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = input[28/2];
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[8] = input[2/2];
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[9] = input[18/2];
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[10] = input[10/2];
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[11] = input[26/2];
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[12] = input[6/2];
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[13] = input[22/2];
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[14] = input[14/2];
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[15] = input[30/2];
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 2
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[0] = step1[0];
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[1] = step1[1];
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[2] = step1[2];
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[3] = step1[3];
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[4] = step1[4];
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[5] = step1[5];
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[6] = step1[6];
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[7] = step1[7];
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[8] = dct_const_round_shift(temp1);
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[15] = dct_const_round_shift(temp2);
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[9] = dct_const_round_shift(temp1);
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[14] = dct_const_round_shift(temp2);
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[10] = dct_const_round_shift(temp1);
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[13] = dct_const_round_shift(temp2);
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[11] = dct_const_round_shift(temp1);
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[12] = dct_const_round_shift(temp2);
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 3
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = step2[0];
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = step2[1];
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = step2[2];
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = step2[3];
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = dct_const_round_shift(temp1);
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = dct_const_round_shift(temp2);
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = dct_const_round_shift(temp1);
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = dct_const_round_shift(temp2);
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[8] = step2[8] + step2[9];
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[9] = step2[8] - step2[9];
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[10] = -step2[10] + step2[11];
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[11] = step2[10] + step2[11];
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[12] = step2[12] + step2[13];
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[13] = step2[12] - step2[13];
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[14] = -step2[14] + step2[15];
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[15] = step2[14] + step2[15];
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 4
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (step1[0] + step1[1]) * cospi_16_64;
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step1[0] - step1[1]) * cospi_16_64;
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[0] = dct_const_round_shift(temp1);
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[1] = dct_const_round_shift(temp2);
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[2] = dct_const_round_shift(temp1);
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[3] = dct_const_round_shift(temp2);
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[4] = step1[4] + step1[5];
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[5] = step1[4] - step1[5];
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[6] = -step1[6] + step1[7];
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[7] = step1[6] + step1[7];
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[8] = step1[8];
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[15] = step1[15];
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[9] = dct_const_round_shift(temp1);
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[14] = dct_const_round_shift(temp2);
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[10] = dct_const_round_shift(temp1);
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[13] = dct_const_round_shift(temp2);
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[11] = step1[11];
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[12] = step1[12];
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 5
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = step2[0] + step2[3];
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = step2[1] + step2[2];
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = step2[1] - step2[2];
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = step2[0] - step2[3];
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = step2[4];
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (step2[6] - step2[5]) * cospi_16_64;
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step2[5] + step2[6]) * cospi_16_64;
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = dct_const_round_shift(temp1);
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = dct_const_round_shift(temp2);
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = step2[7];
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[8] = step2[8] + step2[11];
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[9] = step2[9] + step2[10];
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[10] = step2[9] - step2[10];
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[11] = step2[8] - step2[11];
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[12] = -step2[12] + step2[15];
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[13] = -step2[13] + step2[14];
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[14] = step2[13] + step2[14];
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[15] = step2[12] + step2[15];
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 6
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[0] = step1[0] + step1[7];
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[1] = step1[1] + step1[6];
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[2] = step1[2] + step1[5];
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[3] = step1[3] + step1[4];
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[4] = step1[3] - step1[4];
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[5] = step1[2] - step1[5];
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[6] = step1[1] - step1[6];
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[7] = step1[0] - step1[7];
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[8] = step1[8];
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[9] = step1[9];
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step1[10] + step1[13]) * cospi_16_64;
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[10] = dct_const_round_shift(temp1);
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[13] = dct_const_round_shift(temp2);
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step1[11] + step1[12]) * cospi_16_64;
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[11] = dct_const_round_shift(temp1);
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[12] = dct_const_round_shift(temp2);
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[14] = step1[14];
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[15] = step1[15];
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 7
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[0] = step2[0] + step2[15];
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[1] = step2[1] + step2[14];
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[2] = step2[2] + step2[13];
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[3] = step2[3] + step2[12];
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[4] = step2[4] + step2[11];
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[5] = step2[5] + step2[10];
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[6] = step2[6] + step2[9];
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[7] = step2[7] + step2[8];
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[8] = step2[7] - step2[8];
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[9] = step2[6] - step2[9];
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[10] = step2[5] - step2[10];
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[11] = step2[4] - step2[11];
608233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[12] = step2[3] - step2[12];
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[13] = step2[2] - step2[13];
610233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[14] = step2[1] - step2[14];
611233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[15] = step2[0] - step2[15];
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Full 2-D 16x16 inverse DCT: every one of the 16 input rows is transformed
// with idct16, then every column, and the result is added into |dest| (rows
// of |dest| are |stride| bytes apart).
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t row_pass[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row pass fills the whole intermediate buffer, so it needs no zeroing.
  for (r = 0; r < 16; ++r)
    idct16(input + r * 16, row_pass + r * 16);

  // Column pass: gather a column, transform it, then accumulate into dest.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = row_pass[r * 16 + c];

    idct16(col_in, col_out);

    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + dest[pos]);
    }
  }
}
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan
638233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void iadst16(const int16_t *input, int16_t *output) {
639233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x0 = input[15];
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x1 = input[0];
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x2 = input[13];
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x3 = input[2];
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x4 = input[11];
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x5 = input[4];
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x6 = input[9];
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x7 = input[6];
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x8 = input[7];
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x9 = input[8];
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x10 = input[5];
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x11 = input[10];
653233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x12 = input[3];
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x13 = input[12];
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x14 = input[1];
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x15 = input[14];
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan    output[0] = output[1] = output[2] = output[3] = output[4]
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan              = output[5] = output[6] = output[7] = output[8]
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan              = output[9] = output[10] = output[11] = output[12]
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan              = output[13] = output[14] = output[15] = 0;
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return;
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 1
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = dct_const_round_shift(s0 + s8);
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = dct_const_round_shift(s1 + s9);
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = dct_const_round_shift(s2 + s10);
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = dct_const_round_shift(s3 + s11);
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = dct_const_round_shift(s4 + s12);
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = dct_const_round_shift(s5 + s13);
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = dct_const_round_shift(s6 + s14);
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = dct_const_round_shift(s7 + s15);
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x8  = dct_const_round_shift(s0 - s8);
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x9  = dct_const_round_shift(s1 - s9);
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x10 = dct_const_round_shift(s2 - s10);
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x11 = dct_const_round_shift(s3 - s11);
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x12 = dct_const_round_shift(s4 - s12);
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x13 = dct_const_round_shift(s5 - s13);
699233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x14 = dct_const_round_shift(s6 - s14);
700233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x15 = dct_const_round_shift(s7 - s15);
701233d2500723e5594f3e7c70896ffeeef32b9c950ywan
702233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 2
703233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s0 = x0;
704233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s1 = x1;
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = x2;
706233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = x3;
707233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s4 = x4;
708233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s5 = x5;
709233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = x6;
710233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 = x7;
711233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
712233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
713233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
714233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
715233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
716233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
717233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
718233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
719233d2500723e5594f3e7c70896ffeeef32b9c950ywan
720233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = s0 + s4;
721233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = s1 + s5;
722233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = s2 + s6;
723233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = s3 + s7;
724233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = s0 - s4;
725233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = s1 - s5;
726233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = s2 - s6;
727233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = s3 - s7;
728233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x8 = dct_const_round_shift(s8 + s12);
729233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x9 = dct_const_round_shift(s9 + s13);
730233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x10 = dct_const_round_shift(s10 + s14);
731233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x11 = dct_const_round_shift(s11 + s15);
732233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x12 = dct_const_round_shift(s8 - s12);
733233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x13 = dct_const_round_shift(s9 - s13);
734233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x14 = dct_const_round_shift(s10 - s14);
735233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x15 = dct_const_round_shift(s11 - s15);
736233d2500723e5594f3e7c70896ffeeef32b9c950ywan
737233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 3
738233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s0 = x0;
739233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s1 = x1;
740233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = x2;
741233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = x3;
742233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
743233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
744233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
745233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
746233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s8 = x8;
747233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s9 = x9;
748233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s10 = x10;
749233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s11 = x11;
750233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
751233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
752233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
753233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
754233d2500723e5594f3e7c70896ffeeef32b9c950ywan
755233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = s0 + s2;
756233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = s1 + s3;
757233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = s0 - s2;
758233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = s1 - s3;
759233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = dct_const_round_shift(s4 + s6);
760233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = dct_const_round_shift(s5 + s7);
761233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = dct_const_round_shift(s4 - s6);
762233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = dct_const_round_shift(s5 - s7);
763233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x8 = s8 + s10;
764233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x9 = s9 + s11;
765233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x10 = s8 - s10;
766233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x11 = s9 - s11;
767233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x12 = dct_const_round_shift(s12 + s14);
768233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x13 = dct_const_round_shift(s13 + s15);
769233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x14 = dct_const_round_shift(s12 - s14);
770233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x15 = dct_const_round_shift(s13 - s15);
771233d2500723e5594f3e7c70896ffeeef32b9c950ywan
772233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 4
773233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s2 = (- cospi_16_64) * (x2 + x3);
774233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s3 = cospi_16_64 * (x2 - x3);
775233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s6 = cospi_16_64 * (x6 + x7);
776233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s7 = cospi_16_64 * (- x6 + x7);
777233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s10 = cospi_16_64 * (x10 + x11);
778233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s11 = cospi_16_64 * (- x10 + x11);
779233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s14 = (- cospi_16_64) * (x14 + x15);
780233d2500723e5594f3e7c70896ffeeef32b9c950ywan  s15 = cospi_16_64 * (x14 - x15);
781233d2500723e5594f3e7c70896ffeeef32b9c950ywan
782233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = dct_const_round_shift(s2);
783233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = dct_const_round_shift(s3);
784233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = dct_const_round_shift(s6);
785233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = dct_const_round_shift(s7);
786233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x10 = dct_const_round_shift(s10);
787233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x11 = dct_const_round_shift(s11);
788233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x14 = dct_const_round_shift(s14);
789233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x15 = dct_const_round_shift(s15);
790233d2500723e5594f3e7c70896ffeeef32b9c950ywan
791233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[0] =  x0;
792233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[1] = -x8;
793233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[2] =  x12;
794233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[3] = -x4;
795233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[4] =  x6;
796233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[5] =  x14;
797233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[6] =  x10;
798233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[7] =  x2;
799233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[8] =  x3;
800233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[9] =  x11;
801233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[10] =  x15;
802233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[11] =  x7;
803233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[12] =  x5;
804233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[13] = -x13;
805233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[14] =  x9;
806233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[15] = -x1;
807233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
808233d2500723e5594f3e7c70896ffeeef32b9c950ywan
809233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic const transform_2d IHT_16[] = {
810233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { idct16,  idct16  },  // DCT_DCT  = 0
811233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { iadst16, idct16  },  // ADST_DCT = 1
812233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { idct16,  iadst16 },  // DCT_ADST = 2
813233d2500723e5594f3e7c70896ffeeef32b9c950ywan  { iadst16, iadst16 }   // ADST_ADST = 3
814233d2500723e5594f3e7c70896ffeeef32b9c950ywan};
815233d2500723e5594f3e7c70896ffeeef32b9c950ywan
816233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
817233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            int tx_type) {
818233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i, j;
819233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t out[16 * 16];
820233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t *outptr = out;
821233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t temp_in[16], temp_out[16];
822233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const transform_2d ht = IHT_16[tx_type];
823233d2500723e5594f3e7c70896ffeeef32b9c950ywan
824233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Rows
825233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 16; ++i) {
826233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ht.rows(input, outptr);
827233d2500723e5594f3e7c70896ffeeef32b9c950ywan    input += 16;
828233d2500723e5594f3e7c70896ffeeef32b9c950ywan    outptr += 16;
829233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
830233d2500723e5594f3e7c70896ffeeef32b9c950ywan
831233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Columns
832233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < 16; ++i) {
833233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (j = 0; j < 16; ++j)
834233d2500723e5594f3e7c70896ffeeef32b9c950ywan      temp_in[j] = out[j * 16 + i];
835233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ht.cols(temp_in, temp_out);
836233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (j = 0; j < 16; ++j)
837233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
838233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                        + dest[j * stride + i]);
839233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
840233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
841233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// 2-D 16x16 inverse DCT that row-transforms only the first four input rows
// and adds the result into |dest| (rows of |dest| are |stride| bytes apart).
// Per the original author's note, all non-zero coefficients are expected to
// sit in the upper-left 4x4 area, so rows 4..15 of the intermediate buffer
// are left at their zero initialization.
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t row_pass[16 * 16] = { 0 };
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row pass: rows 0..3 only.
  for (r = 0; r < 4; ++r)
    idct16(input + r * 16, row_pass + r * 16);

  // Column pass: gather a column, transform it, then accumulate into dest.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = row_pass[r * 16 + c];

    idct16(col_in, col_out);

    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + dest[pos]);
    }
  }
}
866233d2500723e5594f3e7c70896ffeeef32b9c950ywan
867233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
868233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i, j;
869233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int a1;
870233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
871233d2500723e5594f3e7c70896ffeeef32b9c950ywan  out = dct_const_round_shift(out * cospi_16_64);
872233d2500723e5594f3e7c70896ffeeef32b9c950ywan  a1 = ROUND_POWER_OF_TWO(out, 6);
873233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (j = 0; j < 16; ++j) {
874233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (i = 0; i < 16; ++i)
875233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dest[i] = clip_pixel(dest[i] + a1);
876233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest += stride;
877233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
878233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
879233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// 32-point one-dimensional inverse transform (an inverse DCT, going by the
// function name) on 16-bit values.
//   input  - 32 coefficients; every index 0..31 is read, none is written.
//   output - 32 results; every index 0..31 is written.
// The work is done in seven butterfly stages that alternate between the
// step1[] and step2[] scratch arrays, followed by a final add/subtract stage.
// Rotations are accumulated in the int temporaries temp1/temp2 and reduced
// with dct_const_round_shift() before being stored; every intermediate value
// is stored as int16_t.
880233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void idct32(const int16_t *input, int16_t *output) {
881233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t step1[32], step2[32];
882233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int temp1, temp2;
883233d2500723e5594f3e7c70896ffeeef32b9c950ywan
884233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 1: even-indexed inputs are copied (bit-reversed order) into step1[0..15]
885233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = input[0];
886233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = input[16];
887233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = input[8];
888233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = input[24];
889233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = input[4];
890233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = input[20];
891233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = input[12];
892233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = input[28];
893233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[8] = input[2];
894233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[9] = input[18];
895233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[10] = input[10];
896233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[11] = input[26];
897233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[12] = input[6];
898233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[13] = input[22];
899233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[14] = input[14];
900233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[15] = input[30];
901233d2500723e5594f3e7c70896ffeeef32b9c950ywan
  // stage 1 (continued): each pair of odd-indexed inputs is rotated into the
  // mirrored slots step1[16 + k] / step1[31 - k], k = 0..7.
902233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
903233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
904233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[16] = dct_const_round_shift(temp1);
905233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[31] = dct_const_round_shift(temp2);
906233d2500723e5594f3e7c70896ffeeef32b9c950ywan
907233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
908233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
909233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[17] = dct_const_round_shift(temp1);
910233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[30] = dct_const_round_shift(temp2);
911233d2500723e5594f3e7c70896ffeeef32b9c950ywan
912233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
913233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
914233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[18] = dct_const_round_shift(temp1);
915233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[29] = dct_const_round_shift(temp2);
916233d2500723e5594f3e7c70896ffeeef32b9c950ywan
917233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
918233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
919233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[19] = dct_const_round_shift(temp1);
920233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[28] = dct_const_round_shift(temp2);
921233d2500723e5594f3e7c70896ffeeef32b9c950ywan
922233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
923233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
924233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[20] = dct_const_round_shift(temp1);
925233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[27] = dct_const_round_shift(temp2);
926233d2500723e5594f3e7c70896ffeeef32b9c950ywan
927233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
928233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
929233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[21] = dct_const_round_shift(temp1);
930233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[26] = dct_const_round_shift(temp2);
931233d2500723e5594f3e7c70896ffeeef32b9c950ywan
932233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
933233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
934233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[22] = dct_const_round_shift(temp1);
935233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[25] = dct_const_round_shift(temp2);
936233d2500723e5594f3e7c70896ffeeef32b9c950ywan
937233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
938233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
939233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[23] = dct_const_round_shift(temp1);
940233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[24] = dct_const_round_shift(temp2);
941233d2500723e5594f3e7c70896ffeeef32b9c950ywan
942233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 2: 0..7 pass through; 8..15 rotated in mirrored pairs; 16..31 adjacent-pair sums/differences
943233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[0] = step1[0];
944233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[1] = step1[1];
945233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[2] = step1[2];
946233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[3] = step1[3];
947233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[4] = step1[4];
948233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[5] = step1[5];
949233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[6] = step1[6];
950233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[7] = step1[7];
951233d2500723e5594f3e7c70896ffeeef32b9c950ywan
952233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
953233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
954233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[8] = dct_const_round_shift(temp1);
955233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[15] = dct_const_round_shift(temp2);
956233d2500723e5594f3e7c70896ffeeef32b9c950ywan
957233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
958233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
959233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[9] = dct_const_round_shift(temp1);
960233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[14] = dct_const_round_shift(temp2);
961233d2500723e5594f3e7c70896ffeeef32b9c950ywan
962233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
963233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
964233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[10] = dct_const_round_shift(temp1);
965233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[13] = dct_const_round_shift(temp2);
966233d2500723e5594f3e7c70896ffeeef32b9c950ywan
967233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
968233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
969233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[11] = dct_const_round_shift(temp1);
970233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[12] = dct_const_round_shift(temp2);
971233d2500723e5594f3e7c70896ffeeef32b9c950ywan
972233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[16] = step1[16] + step1[17];
973233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[17] = step1[16] - step1[17];
974233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[18] = -step1[18] + step1[19];
975233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[19] = step1[18] + step1[19];
976233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[20] = step1[20] + step1[21];
977233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[21] = step1[20] - step1[21];
978233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[22] = -step1[22] + step1[23];
979233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[23] = step1[22] + step1[23];
980233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[24] = step1[24] + step1[25];
981233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[25] = step1[24] - step1[25];
982233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[26] = -step1[26] + step1[27];
983233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[27] = step1[26] + step1[27];
984233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[28] = step1[28] + step1[29];
985233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[29] = step1[28] - step1[29];
986233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[30] = -step1[30] + step1[31];
987233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[31] = step1[30] + step1[31];
988233d2500723e5594f3e7c70896ffeeef32b9c950ywan
989233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 3: 0..3 pass through; (4,7),(5,6) rotated; 8..15 adjacent-pair sums/differences
990233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = step2[0];
991233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = step2[1];
992233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = step2[2];
993233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = step2[3];
994233d2500723e5594f3e7c70896ffeeef32b9c950ywan
995233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
996233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
997233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = dct_const_round_shift(temp1);
998233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = dct_const_round_shift(temp2);
999233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1000233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1001233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = dct_const_round_shift(temp1);
1002233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = dct_const_round_shift(temp2);
1003233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1004233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[8] = step2[8] + step2[9];
1005233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[9] = step2[8] - step2[9];
1006233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[10] = -step2[10] + step2[11];
1007233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[11] = step2[10] + step2[11];
1008233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[12] = step2[12] + step2[13];
1009233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[13] = step2[12] - step2[13];
1010233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[14] = -step2[14] + step2[15];
1011233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[15] = step2[14] + step2[15];
1012233d2500723e5594f3e7c70896ffeeef32b9c950ywan
  // stage 3 (continued): in 16..31 the pairs (17,30),(18,29),(21,26),(22,25)
  // are rotated; the remaining elements pass through unchanged.
1013233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[16] = step2[16];
1014233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[31] = step2[31];
1015233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1016233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1017233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[17] = dct_const_round_shift(temp1);
1018233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[30] = dct_const_round_shift(temp2);
1019233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1020233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1021233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[18] = dct_const_round_shift(temp1);
1022233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[29] = dct_const_round_shift(temp2);
1023233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[19] = step2[19];
1024233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[20] = step2[20];
1025233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1026233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1027233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[21] = dct_const_round_shift(temp1);
1028233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[26] = dct_const_round_shift(temp2);
1029233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1030233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1031233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[22] = dct_const_round_shift(temp1);
1032233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[25] = dct_const_round_shift(temp2);
1033233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[23] = step2[23];
1034233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[24] = step2[24];
1035233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[27] = step2[27];
1036233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[28] = step2[28];
1037233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1038233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 4: (0,1),(2,3) rotated; 4..7 adjacent-pair sums/differences; (9,14),(10,13) rotated
1039233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (step1[0] + step1[1]) * cospi_16_64;
1040233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step1[0] - step1[1]) * cospi_16_64;
1041233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[0] = dct_const_round_shift(temp1);
1042233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[1] = dct_const_round_shift(temp2);
1043233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1044233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1045233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[2] = dct_const_round_shift(temp1);
1046233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[3] = dct_const_round_shift(temp2);
1047233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[4] = step1[4] + step1[5];
1048233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[5] = step1[4] - step1[5];
1049233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[6] = -step1[6] + step1[7];
1050233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[7] = step1[6] + step1[7];
1051233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1052233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[8] = step1[8];
1053233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[15] = step1[15];
1054233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1055233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1056233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[9] = dct_const_round_shift(temp1);
1057233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[14] = dct_const_round_shift(temp2);
1058233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1059233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1060233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[10] = dct_const_round_shift(temp1);
1061233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[13] = dct_const_round_shift(temp2);
1062233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[11] = step1[11];
1063233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[12] = step1[12];
1064233d2500723e5594f3e7c70896ffeeef32b9c950ywan
  // stage 4 (continued): 16..31 are combined by sums/differences inside each
  // consecutive group of four (outer pair and inner pair of the group).
1065233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[16] = step1[16] + step1[19];
1066233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[17] = step1[17] + step1[18];
1067233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[18] = step1[17] - step1[18];
1068233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[19] = step1[16] - step1[19];
1069233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[20] = -step1[20] + step1[23];
1070233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[21] = -step1[21] + step1[22];
1071233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[22] = step1[21] + step1[22];
1072233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[23] = step1[20] + step1[23];
1073233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1074233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[24] = step1[24] + step1[27];
1075233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[25] = step1[25] + step1[26];
1076233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[26] = step1[25] - step1[26];
1077233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[27] = step1[24] - step1[27];
1078233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[28] = -step1[28] + step1[31];
1079233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[29] = -step1[29] + step1[30];
1080233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[30] = step1[29] + step1[30];
1081233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[31] = step1[28] + step1[31];
1082233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1083233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 5: 0..3 sums/differences; (5,6) scaled by cospi_16_64; 8..15 sums/differences in groups of four
1084233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = step2[0] + step2[3];
1085233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = step2[1] + step2[2];
1086233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = step2[1] - step2[2];
1087233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = step2[0] - step2[3];
1088233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = step2[4];
1089233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (step2[6] - step2[5]) * cospi_16_64;
1090233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step2[5] + step2[6]) * cospi_16_64;
1091233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = dct_const_round_shift(temp1);
1092233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = dct_const_round_shift(temp2);
1093233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = step2[7];
1094233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1095233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[8] = step2[8] + step2[11];
1096233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[9] = step2[9] + step2[10];
1097233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[10] = step2[9] - step2[10];
1098233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[11] = step2[8] - step2[11];
1099233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[12] = -step2[12] + step2[15];
1100233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[13] = -step2[13] + step2[14];
1101233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[14] = step2[13] + step2[14];
1102233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[15] = step2[12] + step2[15];
1103233d2500723e5594f3e7c70896ffeeef32b9c950ywan
  // stage 5 (continued): in 16..31 the pairs (18,29),(19,28),(20,27),(21,26)
  // are rotated with cospi_8_64 / cospi_24_64; the rest pass through.
1104233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[16] = step2[16];
1105233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[17] = step2[17];
1106233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1107233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1108233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[18] = dct_const_round_shift(temp1);
1109233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[29] = dct_const_round_shift(temp2);
1110233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1111233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1112233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[19] = dct_const_round_shift(temp1);
1113233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[28] = dct_const_round_shift(temp2);
1114233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1115233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1116233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[20] = dct_const_round_shift(temp1);
1117233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[27] = dct_const_round_shift(temp2);
1118233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1119233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1120233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[21] = dct_const_round_shift(temp1);
1121233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[26] = dct_const_round_shift(temp2);
1122233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[22] = step2[22];
1123233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[23] = step2[23];
1124233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[24] = step2[24];
1125233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[25] = step2[25];
1126233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[30] = step2[30];
1127233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[31] = step2[31];
1128233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1129233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 6: 0..7 mirrored sums/differences; (10,13),(11,12) scaled by cospi_16_64; 16..31 mirrored within groups of eight
1130233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[0] = step1[0] + step1[7];
1131233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[1] = step1[1] + step1[6];
1132233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[2] = step1[2] + step1[5];
1133233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[3] = step1[3] + step1[4];
1134233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[4] = step1[3] - step1[4];
1135233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[5] = step1[2] - step1[5];
1136233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[6] = step1[1] - step1[6];
1137233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[7] = step1[0] - step1[7];
1138233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[8] = step1[8];
1139233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[9] = step1[9];
1140233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1141233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step1[10] + step1[13]) * cospi_16_64;
1142233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[10] = dct_const_round_shift(temp1);
1143233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[13] = dct_const_round_shift(temp2);
1144233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1145233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step1[11] + step1[12]) * cospi_16_64;
1146233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[11] = dct_const_round_shift(temp1);
1147233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[12] = dct_const_round_shift(temp2);
1148233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[14] = step1[14];
1149233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[15] = step1[15];
1150233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1151233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[16] = step1[16] + step1[23];
1152233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[17] = step1[17] + step1[22];
1153233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[18] = step1[18] + step1[21];
1154233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[19] = step1[19] + step1[20];
1155233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[20] = step1[19] - step1[20];
1156233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[21] = step1[18] - step1[21];
1157233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[22] = step1[17] - step1[22];
1158233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[23] = step1[16] - step1[23];
1159233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1160233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[24] = -step1[24] + step1[31];
1161233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[25] = -step1[25] + step1[30];
1162233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[26] = -step1[26] + step1[29];
1163233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[27] = -step1[27] + step1[28];
1164233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[28] = step1[27] + step1[28];
1165233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[29] = step1[26] + step1[29];
1166233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[30] = step1[25] + step1[30];
1167233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step2[31] = step1[24] + step1[31];
1168233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1169233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // stage 7: 0..15 mirrored sums/differences; (20,27),(21,26),(22,25),(23,24) scaled by cospi_16_64; rest pass through
1170233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[0] = step2[0] + step2[15];
1171233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[1] = step2[1] + step2[14];
1172233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[2] = step2[2] + step2[13];
1173233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[3] = step2[3] + step2[12];
1174233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[4] = step2[4] + step2[11];
1175233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[5] = step2[5] + step2[10];
1176233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[6] = step2[6] + step2[9];
1177233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[7] = step2[7] + step2[8];
1178233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[8] = step2[7] - step2[8];
1179233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[9] = step2[6] - step2[9];
1180233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[10] = step2[5] - step2[10];
1181233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[11] = step2[4] - step2[11];
1182233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[12] = step2[3] - step2[12];
1183233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[13] = step2[2] - step2[13];
1184233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[14] = step2[1] - step2[14];
1185233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[15] = step2[0] - step2[15];
1186233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1187233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[16] = step2[16];
1188233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[17] = step2[17];
1189233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[18] = step2[18];
1190233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[19] = step2[19];
1191233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1192233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step2[20] + step2[27]) * cospi_16_64;
1193233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[20] = dct_const_round_shift(temp1);
1194233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[27] = dct_const_round_shift(temp2);
1195233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1196233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step2[21] + step2[26]) * cospi_16_64;
1197233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[21] = dct_const_round_shift(temp1);
1198233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[26] = dct_const_round_shift(temp2);
1199233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1200233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step2[22] + step2[25]) * cospi_16_64;
1201233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[22] = dct_const_round_shift(temp1);
1202233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[25] = dct_const_round_shift(temp2);
1203233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1204233d2500723e5594f3e7c70896ffeeef32b9c950ywan  temp2 = (step2[23] + step2[24]) * cospi_16_64;
1205233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[23] = dct_const_round_shift(temp1);
1206233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[24] = dct_const_round_shift(temp2);
1207233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[28] = step2[28];
1208233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[29] = step2[29];
1209233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[30] = step2[30];
1210233d2500723e5594f3e7c70896ffeeef32b9c950ywan  step1[31] = step2[31];
1211233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1212233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // final stage: output[k] = step1[k] + step1[31 - k] and output[31 - k] = step1[k] - step1[31 - k], k = 0..15
1213233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[0] = step1[0] + step1[31];
1214233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[1] = step1[1] + step1[30];
1215233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[2] = step1[2] + step1[29];
1216233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[3] = step1[3] + step1[28];
1217233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[4] = step1[4] + step1[27];
1218233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[5] = step1[5] + step1[26];
1219233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[6] = step1[6] + step1[25];
1220233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[7] = step1[7] + step1[24];
1221233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[8] = step1[8] + step1[23];
1222233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[9] = step1[9] + step1[22];
1223233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[10] = step1[10] + step1[21];
1224233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[11] = step1[11] + step1[20];
1225233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[12] = step1[12] + step1[19];
1226233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[13] = step1[13] + step1[18];
1227233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[14] = step1[14] + step1[17];
1228233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[15] = step1[15] + step1[16];
1229233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[16] = step1[15] - step1[16];
1230233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[17] = step1[14] - step1[17];
1231233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[18] = step1[13] - step1[18];
1232233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[19] = step1[12] - step1[19];
1233233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[20] = step1[11] - step1[20];
1234233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[21] = step1[10] - step1[21];
1235233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[22] = step1[9] - step1[22];
1236233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[23] = step1[8] - step1[23];
1237233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[24] = step1[7] - step1[24];
1238233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[25] = step1[6] - step1[25];
1239233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[26] = step1[5] - step1[26];
1240233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[27] = step1[4] - step1[27];
1241233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[28] = step1[3] - step1[28];
1242233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[29] = step1[2] - step1[29];
1243233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[30] = step1[1] - step1[30];
1244233d2500723e5594f3e7c70896ffeeef32b9c950ywan  output[31] = step1[0] - step1[31];
1245233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1246233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 32x32 inverse DCT over all 1024 coefficients, added onto the prediction.
 *
 * input:  32x32 coefficients, row after row (32 values per row).
 * dest:   pixel buffer that is read, offset by the transform result, and
 *         written back after clipping.
 * stride: distance in bytes between successive rows of dest.
 */
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t rows[32 * 32];
  int16_t col_in[32], col_out[32];
  int r, c;

  // First pass: 1-D transform of each of the 32 rows.
  for (r = 0; r < 32; ++r) {
    const int16_t *const src = input + r * 32;
    int16_t *const dst = rows + r * 32;
    int nonzero = 0;

    // OR the whole row together; the result is zero only when every
    // coefficient in the row is zero.
    for (c = 0; c < 32; ++c)
      nonzero |= src[c];

    if (nonzero) {
      idct32(src, dst);
    } else {
      // All-zero row: skip idct32 and store zeros directly.
      vpx_memset(dst, 0, sizeof(int16_t) * 32);
    }
  }

  // Second pass: 1-D transform of each column, then round by 6 bits, add to
  // the existing pixel and clip.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      col_in[r] = rows[r * 32 + c];

    idct32(col_in, col_out);

    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + *px);
    }
  }
}
1283233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 32x32 inverse DCT for blocks whose non-zero coefficients all lie in the
 * upper-left 8x8 corner, added onto the prediction.
 *
 * Only the first 8 rows of input are transformed in the row pass; the
 * remaining rows of the intermediate buffer stay zero.
 *
 * input:  32x32 coefficients, row after row (32 values per row).
 * dest:   pixel buffer that is read, offset by the transform result, and
 *         written back after clipping.
 * stride: distance in bytes between successive rows of dest.
 */
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t rows[32 * 32] = {0};
  int16_t col_in[32], col_out[32];
  int r, c;

  // Row pass, restricted to the 8 rows that may hold non-zero coefficients.
  for (r = 0; r < 8; ++r)
    idct32(input + r * 32, rows + r * 32);

  // Column pass over the full width, then round by 6 bits, add to the
  // existing pixel and clip.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      col_in[r] = rows[r * 32 + c];

    idct32(col_in, col_out);

    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + *px);
    }
  }
}
1308233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1309233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
1310233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i, j;
1311233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int a1;
1312233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1313233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
1314233d2500723e5594f3e7c70896ffeeef32b9c950ywan  out = dct_const_round_shift(out * cospi_16_64);
1315233d2500723e5594f3e7c70896ffeeef32b9c950ywan  a1 = ROUND_POWER_OF_TWO(out, 6);
1316233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1317233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (j = 0; j < 32; ++j) {
1318233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (i = 0; i < 32; ++i)
1319233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dest[i] = clip_pixel(dest[i] + a1);
1320233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dest += stride;
1321233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1322233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1323233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1324233d2500723e5594f3e7c70896ffeeef32b9c950ywan// idct
/* Dispatches a 4x4 inverse DCT + add based on eob: values of 1 or less go to
 * the single-coefficient routine, anything larger to the 16-coefficient one. */
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob <= 1) {
    vp9_idct4x4_1_add(input, dest, stride);
    return;
  }

  vp9_idct4x4_16_add(input, dest, stride);
}
1331233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1332233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Dispatches a 4x4 inverse Walsh-Hadamard transform + add based on eob:
 * values of 1 or less go to the single-coefficient routine, anything larger
 * to the 16-coefficient one. */
void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob <= 1) {
    vp9_iwht4x4_1_add(input, dest, stride);
    return;
  }

  vp9_iwht4x4_16_add(input, dest, stride);
}
1339233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Dispatches an 8x8 inverse DCT + add to the cheapest routine that eob
 * permits. Fewer non-zero coefficients allow a simpler computation:
 *   eob == 1   -> DC-only routine
 *   eob <= 10  -> reduced 10-coefficient routine
 *   otherwise  -> full 64-coefficient routine
 *
 * TODO(yunqingwang): the "eob == 1" case is also handled inside the 8x8
 * transform itself; combine that with the dispatch here.
 */
void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob == 1) {
    // Only the DC coefficient is present.
    vp9_idct8x8_1_add(input, dest, stride);
    return;
  }

  if (eob <= 10) {
    vp9_idct8x8_10_add(input, dest, stride);
    return;
  }

  vp9_idct8x8_64_add(input, dest, stride);
}
1356233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Dispatches a 16x16 inverse DCT + add to the cheapest routine that eob
 * permits:
 *   eob == 1   -> DC-only routine
 *   eob <= 10  -> reduced 10-coefficient routine
 *   otherwise  -> full 256-coefficient routine
 */
void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  if (eob == 1) {
    /* Only the DC coefficient is present. */
    vp9_idct16x16_1_add(input, dest, stride);
    return;
  }

  if (eob <= 10) {
    vp9_idct16x16_10_add(input, dest, stride);
    return;
  }

  vp9_idct16x16_256_add(input, dest, stride);
}
1369233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Dispatches a 32x32 inverse DCT + add to the cheapest routine that eob
 * permits:
 *   eob == 1   -> single-coefficient routine
 *   eob <= 34  -> routine for non-zero coefficients confined to the
 *                 upper-left 8x8
 *   otherwise  -> full 1024-coefficient routine
 */
void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  if (eob == 1) {
    vp9_idct32x32_1_add(input, dest, stride);
    return;
  }

  if (eob <= 34) {
    // Non-zero coefficients only in the upper-left 8x8.
    vp9_idct32x32_34_add(input, dest, stride);
    return;
  }

  vp9_idct32x32_1024_add(input, dest, stride);
}
1380233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1381233d2500723e5594f3e7c70896ffeeef32b9c950ywan// iht
1382233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
1383233d2500723e5594f3e7c70896ffeeef32b9c950ywan                    int stride, int eob) {
1384233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (tx_type == DCT_DCT)
1385233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_idct4x4_add(input, dest, stride, eob);
1386233d2500723e5594f3e7c70896ffeeef32b9c950ywan  else
1387233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_iht4x4_16_add(input, dest, stride, tx_type);
1388233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1389233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1390233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
1391233d2500723e5594f3e7c70896ffeeef32b9c950ywan                    int stride, int eob) {
1392233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (tx_type == DCT_DCT) {
1393233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_idct8x8_add(input, dest, stride, eob);
1394233d2500723e5594f3e7c70896ffeeef32b9c950ywan  } else {
1395233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_iht8x8_64_add(input, dest, stride, tx_type);
1396233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1397233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1398233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1399233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
1400233d2500723e5594f3e7c70896ffeeef32b9c950ywan                      int stride, int eob) {
1401233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (tx_type == DCT_DCT) {
1402233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_idct16x16_add(input, dest, stride, eob);
1403233d2500723e5594f3e7c70896ffeeef32b9c950ywan  } else {
1404233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_iht16x16_256_add(input, dest, stride, tx_type);
1405233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1406233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1407