/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <assert.h>
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <math.h>
135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "./vpx_config.h"
155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vp9_rtcd.h"
16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_blockd.h"
18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp9/common/vp9_idct.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_systemdependent.h"
20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
212ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianstatic INLINE int fdct_round_shift(int input) {
222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  assert(INT16_MIN <= rv && rv <= INT16_MAX);
242ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  return rv;
252ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian}
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void fdct4(const int16_t *input, int16_t *output) {
28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t step[4];
29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int temp1, temp2;
30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[0] = input[0] + input[3];
32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[1] = input[1] + input[2];
33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[2] = input[1] - input[2];
34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[3] = input[0] - input[3];
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step[0] + step[1]) * cospi_16_64;
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step[0] - step[1]) * cospi_16_64;
382ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[0] = fdct_round_shift(temp1);
392ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[2] = fdct_round_shift(temp2);
40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
422ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[1] = fdct_round_shift(temp1);
432ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[3] = fdct_round_shift(temp2);
44ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// DC-only 4x4 forward transform.
//
// Sums the 16 samples of the 4x4 block at `input` (rows are `stride` elements
// apart), stores twice that sum in output[0] and clears output[1]. No other
// output element is written.
void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  // Accumulate in int so the running total is never narrowed mid-sum.
  int sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += input[r * stride + c];

  // Multiply instead of `<< 1`: left-shifting a negative value is undefined
  // behaviour in C, and the sum of residuals is frequently negative.
  output[0] = (int16_t)(sum * 2);
  output[1] = 0;
}
56ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // The 2D transform is done with two passes which are actually pretty
59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // similar. In the first one, we transform the columns and transpose
60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // the results. In the second one, we transform the rows. To achieve that,
612ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  // as the first pass results are transposed, we transpose the columns (that
62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // is the transposed rows) and transpose the results (so that it goes back
63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // in normal/row positions).
64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int pass;
65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We need an intermediate buffer between passes.
66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t intermediate[4 * 4];
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *in = input;
68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t *out = intermediate;
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Do the two transform/transpose passes
70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (pass = 0; pass < 2; ++pass) {
71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int input[4];
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int step[4];
73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*needs32*/ int temp1, temp2;
74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    int i;
75ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (i = 0; i < 4; ++i) {
76ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Load inputs.
77ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      if (0 == pass) {
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[0] = in[0 * stride] * 16;
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[1] = in[1 * stride] * 16;
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[2] = in[2 * stride] * 16;
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[3] = in[3 * stride] * 16;
82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        if (i == 0 && input[0]) {
83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang          input[0] += 1;
84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        }
85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      } else {
86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[0] = in[0 * 4];
87ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[1] = in[1 * 4];
88ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[2] = in[2 * 4];
89ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[3] = in[3 * 4];
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      }
91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Transform.
92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      step[0] = input[0] + input[3];
93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      step[1] = input[1] + input[2];
94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      step[2] = input[1] - input[2];
95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      step[3] = input[0] - input[3];
96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      temp1 = (step[0] + step[1]) * cospi_16_64;
97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      temp2 = (step[0] - step[1]) * cospi_16_64;
982ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      out[0] = fdct_round_shift(temp1);
992ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      out[2] = fdct_round_shift(temp2);
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
1022ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      out[1] = fdct_round_shift(temp1);
1032ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      out[3] = fdct_round_shift(temp2);
104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Do next column (which is a transposed row in second/horizontal pass)
105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in++;
106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      out += 4;
107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Setup in/out for next pass.
109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in = intermediate;
110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    out = output;
111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    int i, j;
115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (i = 0; i < 4; ++i) {
116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      for (j = 0; j < 4; ++j)
117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void fadst4(const int16_t *input, int16_t *output) {
123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x0, x1, x2, x3;
124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int s0, s1, s2, s3, s4, s5, s6, s7;
125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = input[0];
127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = input[1];
128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = input[2];
129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = input[3];
130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  if (!(x0 | x1 | x2 | x3)) {
132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    output[0] = output[1] = output[2] = output[3] = 0;
133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    return;
134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = sinpi_1_9 * x0;
137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = sinpi_4_9 * x0;
138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = sinpi_2_9 * x1;
139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = sinpi_1_9 * x1;
140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = sinpi_3_9 * x2;
141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = sinpi_4_9 * x3;
142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = sinpi_2_9 * x3;
143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = x0 + x1 - x3;
144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s2 + s5;
146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = sinpi_3_9 * s7;
147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s1 - s3 + s6;
148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s4;
149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0 + x3;
151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1;
152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2 - x3;
153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x2 - x0 + x3;
154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // 1-D transform scaling factor is sqrt(2).
1562ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[0] = fdct_round_shift(s0);
1572ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[1] = fdct_round_shift(s1);
1582ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[2] = fdct_round_shift(s2);
1592ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[3] = fdct_round_shift(s3);
160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangstatic const transform_2d FHT_4[] = {
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fdct4,  fdct4  },  // DCT_DCT  = 0
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fadst4, fdct4  },  // ADST_DCT = 1
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fdct4,  fadst4 },  // DCT_ADST = 2
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fadst4, fadst4 }   // ADST_ADST = 3
167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang};
168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1692ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianvoid vp9_fht4x4_c(const int16_t *input, int16_t *output,
1702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                  int stride, int tx_type) {
1712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  if (tx_type == DCT_DCT) {
1722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    vp9_fdct4x4_c(input, output, stride);
1732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  } else {
1742ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t out[4 * 4];
1752ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t *outptr = &out[0];
1762ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int i, j;
1772ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t temp_in[4], temp_out[4];
1782ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    const transform_2d ht = FHT_4[tx_type];
179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1802ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    // Columns
1812ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    for (i = 0; i < 4; ++i) {
1822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 4; ++j)
1832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        temp_in[j] = input[j * stride + i] * 16;
1842ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      if (i == 0 && temp_in[0])
1852ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        temp_in[0] += 1;
1862ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      ht.cols(temp_in, temp_out);
1872ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 4; ++j)
1882ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        outptr[j * 4 + i] = temp_out[j];
1892ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    }
190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1912ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    // Rows
1922ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    for (i = 0; i < 4; ++i) {
1932ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 4; ++j)
1942ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        temp_in[j] = out[j + i * 4];
1952ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      ht.rows(temp_in, temp_out);
1962ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 4; ++j)
1972ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        output[j + i * 4] = (temp_out[j] + 1) >> 2;
1982ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    }
199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void fdct8(const int16_t *input, int16_t *output) {
203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*needs32*/ int t0, t1, t2, t3;
205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*canbe16*/ int x0, x1, x2, x3;
206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = input[0] + input[7];
209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = input[1] + input[6];
210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = input[2] + input[5];
211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = input[3] + input[4];
212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = input[3] - input[4];
213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = input[2] - input[5];
214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = input[1] - input[6];
215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = input[0] - input[7];
216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // fdct4(step, step);
218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s3;
219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 + s2;
220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s1 - s2;
221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s0 - s3;
222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t0 = (x0 + x1) * cospi_16_64;
223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t1 = (x0 - x1) * cospi_16_64;
224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
2262ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[0] = fdct_round_shift(t0);
2272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[2] = fdct_round_shift(t2);
2282ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[4] = fdct_round_shift(t1);
2292ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[6] = fdct_round_shift(t3);
230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t0 = (s6 - s5) * cospi_16_64;
233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t1 = (s6 + s5) * cospi_16_64;
2342ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  t2 = fdct_round_shift(t0);
2352ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  t3 = fdct_round_shift(t1);
236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 3
238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s4 + t2;
239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s4 - t2;
240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s7 - t3;
241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s7 + t3;
242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 4
244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
2482ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[1] = fdct_round_shift(t0);
2492ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[3] = fdct_round_shift(t2);
2502ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[5] = fdct_round_shift(t1);
2512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  output[7] = fdct_round_shift(t3);
252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// DC-only 8x8 forward transform.
//
// Sums the 64 samples of the 8x8 block at `input` (rows are `stride` elements
// apart), stores the sum in output[0] and clears output[1]. No other output
// element is written.
void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
  int16_t total = 0;
  int y, x;

  for (y = 0; y < 8; ++y) {
    const int16_t *const row = input + y * stride;
    for (x = 0; x < 8; ++x)
      total += row[x];
  }

  output[0] = total;
  output[1] = 0;
}
264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i, j;
267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t intermediate[64];
268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Transform columns
270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    int16_t *output = intermediate;
272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*needs32*/ int t0, t1, t2, t3;
274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int x0, x1, x2, x3;
275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    int i;
277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (i = 0; i < 8; i++) {
278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // stage 1
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s0 = (input[0 * stride] + input[7 * stride]) * 4;
2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s1 = (input[1 * stride] + input[6 * stride]) * 4;
2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s2 = (input[2 * stride] + input[5 * stride]) * 4;
2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s3 = (input[3 * stride] + input[4 * stride]) * 4;
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s4 = (input[3 * stride] - input[4 * stride]) * 4;
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s5 = (input[2 * stride] - input[5 * stride]) * 4;
2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s6 = (input[1 * stride] - input[6 * stride]) * 4;
2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      s7 = (input[0 * stride] - input[7 * stride]) * 4;
2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // fdct4(step, step);
289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x0 = s0 + s3;
290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x1 = s1 + s2;
291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x2 = s1 - s2;
292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x3 = s0 - s3;
293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t0 = (x0 + x1) * cospi_16_64;
294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t1 = (x0 - x1) * cospi_16_64;
295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
2972ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[0 * 8] = fdct_round_shift(t0);
2982ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[2 * 8] = fdct_round_shift(t2);
2992ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[4 * 8] = fdct_round_shift(t1);
3002ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[6 * 8] = fdct_round_shift(t3);
301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Stage 2
303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t0 = (s6 - s5) * cospi_16_64;
304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t1 = (s6 + s5) * cospi_16_64;
3052ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      t2 = fdct_round_shift(t0);
3062ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      t3 = fdct_round_shift(t1);
307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Stage 3
309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x0 = s4 + t2;
310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x1 = s4 - t2;
311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x2 = s7 - t3;
312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      x3 = s7 + t3;
313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Stage 4
315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
3192ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[1 * 8] = fdct_round_shift(t0);
3202ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[3 * 8] = fdct_round_shift(t2);
3212ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[5 * 8] = fdct_round_shift(t1);
3222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      output[7 * 8] = fdct_round_shift(t3);
323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      input++;
324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      output++;
325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Rows
329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 8; ++i) {
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    fdct8(&intermediate[i * 8], &final_output[i * 8]);
331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (j = 0; j < 8; ++j)
332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      final_output[j + i * 8] /= 2;
333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// DC-only 16x16 forward transform.
//
// Sums the 256 samples of the 16x16 block at `input` (rows are `stride`
// elements apart), stores the sum shifted right by one in output[0] and
// clears output[1]. No other output element is written.
void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  // The total of 256 samples can exceed the int16_t range (for example
  // 256 * 255 = 65280), so accumulate in int and narrow only after the shift.
  int sum = 0;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      sum += input[r * stride + c];

  // Keep the shift (not a division) so negative odd sums round the same way
  // as before.
  output[0] = (int16_t)(sum >> 1);
  output[1] = 0;
}
346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // The 2D transform is done with two passes which are actually pretty
349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // similar. In the first one, we transform the columns and transpose
350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // the results. In the second one, we transform the rows. To achieve that,
3512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  // as the first pass results are transposed, we transpose the columns (that
352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // is the transposed rows) and transpose the results (so that it goes back
353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // in normal/row positions).
354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int pass;
355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // We need an intermediate buffer between passes.
356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t intermediate[256];
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *in = input;
358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int16_t *out = intermediate;
359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Do the two transform/transpose passes
360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (pass = 0; pass < 2; ++pass) {
361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int step1[8];
362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int step2[8];
363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int step3[8];
364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int input[8];
365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*needs32*/ int temp1, temp2;
366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    int i;
367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    for (i = 0; i < 16; i++) {
368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      if (0 == pass) {
369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // Calculate input for the first 8 results.
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[0] = (in[0 * stride] + in[15 * stride]) * 4;
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[1] = (in[1 * stride] + in[14 * stride]) * 4;
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[2] = (in[2 * stride] + in[13 * stride]) * 4;
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[3] = (in[3 * stride] + in[12 * stride]) * 4;
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[4] = (in[4 * stride] + in[11 * stride]) * 4;
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[5] = (in[5 * stride] + in[10 * stride]) * 4;
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // Calculate input for the next 8 results.
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      } else {
388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // Calculate input for the first 8 results.
389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // Calculate input for the next 8 results.
398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      }
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      // Work on the first eight values; fdct8(input, even_results);
408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      {
409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        /*needs32*/ int t0, t1, t2, t3;
411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        /*canbe16*/ int x0, x1, x2, x3;
412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // stage 1
414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s0 = input[0] + input[7];
415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s1 = input[1] + input[6];
416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s2 = input[2] + input[5];
417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s3 = input[3] + input[4];
418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s4 = input[3] - input[4];
419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s5 = input[2] - input[5];
420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s6 = input[1] - input[6];
421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        s7 = input[0] - input[7];
422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // fdct4(step, step);
424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x0 = s0 + s3;
425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x1 = s1 + s2;
426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x2 = s1 - s2;
427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x3 = s0 - s3;
428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t0 = (x0 + x1) * cospi_16_64;
429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t1 = (x0 - x1) * cospi_16_64;
430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
4322ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[0] = fdct_round_shift(t0);
4332ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[4] = fdct_round_shift(t2);
4342ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[8] = fdct_round_shift(t1);
4352ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[12] = fdct_round_shift(t3);
436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // Stage 2
438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t0 = (s6 - s5) * cospi_16_64;
439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t1 = (s6 + s5) * cospi_16_64;
4402ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        t2 = fdct_round_shift(t0);
4412ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        t3 = fdct_round_shift(t1);
442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // Stage 3
444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x0 = s4 + t2;
445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x1 = s4 - t2;
446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x2 = s7 - t3;
447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        x3 = s7 + t3;
448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // Stage 4
450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
4542ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[2] = fdct_round_shift(t0);
4552ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[6] = fdct_round_shift(t2);
4562ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[10] = fdct_round_shift(t1);
4572ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[14] = fdct_round_shift(t3);
458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      }
459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Work on the next eight values; step1 -> odd_results
460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      {
461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // step 2
462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp1 = (step1[5] - step1[2]) * cospi_16_64;
463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp2 = (step1[4] - step1[3]) * cospi_16_64;
4642ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[2] = fdct_round_shift(temp1);
4652ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[3] = fdct_round_shift(temp2);
466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp1 = (step1[4] + step1[3]) * cospi_16_64;
467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp2 = (step1[5] + step1[2]) * cospi_16_64;
4682ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[4] = fdct_round_shift(temp1);
4692ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[5] = fdct_round_shift(temp2);
470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // step 3
471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[0] = step1[0] + step2[3];
472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[1] = step1[1] + step2[2];
473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[2] = step1[1] - step2[2];
474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[3] = step1[0] - step2[3];
475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[4] = step1[7] - step2[4];
476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[5] = step1[6] - step2[5];
477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[6] = step1[6] + step2[5];
478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step3[7] = step1[7] + step2[4];
479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // step 4
480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
4822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[1] = fdct_round_shift(temp1);
4832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[2] = fdct_round_shift(temp2);
484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
4862ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[5] = fdct_round_shift(temp1);
4872ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        step2[6] = fdct_round_shift(temp2);
488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // step 5
489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[0] = step3[0] + step2[1];
490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[1] = step3[0] - step2[1];
491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        step1[2] = step3[3] + step2[2];
492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        step1[3] = step3[3] - step2[2];
493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        step1[4] = step3[4] - step2[5];
494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        step1[5] = step3[4] + step2[5];
495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[6] = step3[7] - step2[6];
496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        step1[7] = step3[7] + step2[6];
497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        // step 6
498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
5002ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[1] = fdct_round_shift(temp1);
5012ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[9] = fdct_round_shift(temp2);
502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
5042ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[5] = fdct_round_shift(temp1);
5052ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[13] = fdct_round_shift(temp2);
506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
5082ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[3] = fdct_round_shift(temp1);
5092ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[11] = fdct_round_shift(temp2);
510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
5122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[7] = fdct_round_shift(temp1);
5132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        out[15] = fdct_round_shift(temp2);
514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      }
515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      // Do next column (which is a transposed row in second/horizontal pass)
516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      in++;
517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang      out += 16;
518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    }
519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Setup in/out for next pass.
520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    in = intermediate;
521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    out = output;
522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void fadst8(const int16_t *input, int16_t *output) {
526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int s0, s1, s2, s3, s4, s5, s6, s7;
527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x0 = input[7];
529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x1 = input[0];
530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x2 = input[5];
531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x3 = input[2];
532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x4 = input[3];
533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x5 = input[4];
534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x6 = input[1];
535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x7 = input[6];
536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5472ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x0 = fdct_round_shift(s0 + s4);
5482ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x1 = fdct_round_shift(s1 + s5);
5492ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x2 = fdct_round_shift(s2 + s6);
5502ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x3 = fdct_round_shift(s3 + s7);
5512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x4 = fdct_round_shift(s0 - s4);
5522ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x5 = fdct_round_shift(s1 - s5);
5532ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x6 = fdct_round_shift(s2 - s6);
5542ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x7 = fdct_round_shift(s3 - s7);
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0;
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1;
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2;
560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x3;
561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s2;
567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 + s3;
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s0 - s2;
569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s1 - s3;
5702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x4 = fdct_round_shift(s4 + s6);
5712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x5 = fdct_round_shift(s5 + s7);
5722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x6 = fdct_round_shift(s4 - s6);
5732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x7 = fdct_round_shift(s5 - s7);
574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 3
576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = cospi_16_64 * (x2 + x3);
577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = cospi_16_64 * (x2 - x3);
578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = cospi_16_64 * (x6 + x7);
579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = cospi_16_64 * (x6 - x7);
580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
5812ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x2 = fdct_round_shift(s2);
5822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x3 = fdct_round_shift(s3);
5832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x6 = fdct_round_shift(s6);
5842ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x7 = fdct_round_shift(s7);
585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] =   x0;
587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = - x4;
588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] =   x6;
589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = - x2;
590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] =   x3;
591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = - x7;
592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] =   x5;
593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = - x1;
594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangstatic const transform_2d FHT_8[] = {
5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fdct8,  fdct8  },  // DCT_DCT  = 0
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fadst8, fdct8  },  // ADST_DCT = 1
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fdct8,  fadst8 },  // DCT_ADST = 2
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fadst8, fadst8 }   // ADST_ADST = 3
601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang};
602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
6032ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianvoid vp9_fht8x8_c(const int16_t *input, int16_t *output,
6042ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                  int stride, int tx_type) {
6052ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  if (tx_type == DCT_DCT) {
6062ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    vp9_fdct8x8_c(input, output, stride);
6072ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  } else {
6082ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t out[64];
6092ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t *outptr = &out[0];
6102ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int i, j;
6112ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t temp_in[8], temp_out[8];
6122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    const transform_2d ht = FHT_8[tx_type];
6132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
6142ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    // Columns
6152ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    for (i = 0; i < 8; ++i) {
6162ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j)
6172ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        temp_in[j] = input[j * stride + i] * 4;
6182ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      ht.cols(temp_in, temp_out);
6192ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j)
6202ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        outptr[j * 8 + i] = temp_out[j];
6212ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    }
622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
6232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    // Rows
6242ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    for (i = 0; i < 8; ++i) {
6252ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j)
6262ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        temp_in[j] = out[j + i * 8];
6272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      ht.rows(temp_in, temp_out);
6282ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 8; ++j)
6292ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
6302ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    }
631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang   pixel. */
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int i;
638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int a1, b1, c1, d1, e1;
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *ip = input;
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int16_t *op = output;
641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    a1 = ip[0 * stride];
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    b1 = ip[1 * stride];
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    c1 = ip[2 * stride];
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    d1 = ip[3 * stride];
647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 += b1;
649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 = d1 - c1;
650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    e1 = (a1 - d1) >> 1;
651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    b1 = e1 - b1;
652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    c1 = e1 - c1;
653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 -= c1;
654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 += b1;
655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[0] = a1;
656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[4] = c1;
657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[8] = d1;
658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op[12] = b1;
659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ip++;
661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op++;
662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  ip = output;
664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  op = output;
665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  for (i = 0; i < 4; i++) {
667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 = ip[0];
668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    b1 = ip[1];
669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    c1 = ip[2];
670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 = ip[3];
671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 += b1;
673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 -= c1;
674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    e1 = (a1 - d1) >> 1;
675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    b1 = e1 - b1;
676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    c1 = e1 - c1;
677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    a1 -= c1;
678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    d1 += b1;
6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    op[0] = a1 * UNIT_QUANT_FACTOR;
6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    op[1] = c1 * UNIT_QUANT_FACTOR;
6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    op[2] = d1 * UNIT_QUANT_FACTOR;
6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    op[3] = b1 * UNIT_QUANT_FACTOR;
683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ip += 4;
685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    op += 4;
686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang// Rewrote to use same algorithm as others.
6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void fdct16(const int16_t in[16], int16_t out[16]) {
691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*canbe16*/ int step1[8];
692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*canbe16*/ int step2[8];
693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*canbe16*/ int step3[8];
694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*canbe16*/ int input[8];
695ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /*needs32*/ int temp1, temp2;
696ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
697ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // step 1
698ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[0] = in[0] + in[15];
699ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[1] = in[1] + in[14];
700ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[2] = in[2] + in[13];
701ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[3] = in[3] + in[12];
702ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[4] = in[4] + in[11];
703ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[5] = in[5] + in[10];
704ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[6] = in[6] + in[ 9];
705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  input[7] = in[7] + in[ 8];
706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = in[7] - in[ 8];
708ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = in[6] - in[ 9];
709ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[2] = in[5] - in[10];
710ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[3] = in[4] - in[11];
711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[4] = in[3] - in[12];
712ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[5] = in[2] - in[13];
713ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = in[1] - in[14];
714ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = in[0] - in[15];
715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  // fdct8(step, step);
717ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  {
718ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
719ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*needs32*/ int t0, t1, t2, t3;
720ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    /*canbe16*/ int x0, x1, x2, x3;
721ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
722ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // stage 1
723ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s0 = input[0] + input[7];
724ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s1 = input[1] + input[6];
725ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s2 = input[2] + input[5];
726ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s3 = input[3] + input[4];
727ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s4 = input[3] - input[4];
728ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s5 = input[2] - input[5];
729ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s6 = input[1] - input[6];
730ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    s7 = input[0] - input[7];
731ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // fdct4(step, step);
733ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x0 = s0 + s3;
734ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x1 = s1 + s2;
735ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x2 = s1 - s2;
736ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x3 = s0 - s3;
737ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t0 = (x0 + x1) * cospi_16_64;
738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t1 = (x0 - x1) * cospi_16_64;
739ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
740ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
7412ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[0] = fdct_round_shift(t0);
7422ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[4] = fdct_round_shift(t2);
7432ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[8] = fdct_round_shift(t1);
7442ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[12] = fdct_round_shift(t3);
745ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
746ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage 2
747ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t0 = (s6 - s5) * cospi_16_64;
748ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t1 = (s6 + s5) * cospi_16_64;
7492ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    t2 = fdct_round_shift(t0);
7502ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    t3 = fdct_round_shift(t1);
751ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
752ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage 3
753ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x0 = s4 + t2;
754ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x1 = s4 - t2;
755ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x2 = s7 - t3;
756ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    x3 = s7 + t3;
757ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
758ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    // Stage 4
759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
760ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
761ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
762ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
7632ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[2] = fdct_round_shift(t0);
7642ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[6] = fdct_round_shift(t2);
7652ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[10] = fdct_round_shift(t1);
7662ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    out[14] = fdct_round_shift(t3);
767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
768ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
769ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // step 2
770ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step1[5] - step1[2]) * cospi_16_64;
771ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[4] - step1[3]) * cospi_16_64;
7722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[2] = fdct_round_shift(temp1);
7732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[3] = fdct_round_shift(temp2);
774ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = (step1[4] + step1[3]) * cospi_16_64;
775ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = (step1[5] + step1[2]) * cospi_16_64;
7762ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[4] = fdct_round_shift(temp1);
7772ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[5] = fdct_round_shift(temp2);
778ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
779ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // step 3
780ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[0] = step1[0] + step2[3];
781ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[1] = step1[1] + step2[2];
782ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[2] = step1[1] - step2[2];
783ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[3] = step1[0] - step2[3];
784ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[4] = step1[7] - step2[4];
785ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[5] = step1[6] - step2[5];
786ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[6] = step1[6] + step2[5];
787ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step3[7] = step1[7] + step2[4];
788ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
789ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // step 4
790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
7922ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[1] = fdct_round_shift(temp1);
7932ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[2] = fdct_round_shift(temp2);
794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
7962ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[5] = fdct_round_shift(temp1);
7972ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  step2[6] = fdct_round_shift(temp2);
798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // step 5
800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[0] = step3[0] + step2[1];
801ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[1] = step3[0] - step2[1];
802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  step1[2] = step3[3] + step2[2];
803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  step1[3] = step3[3] - step2[2];
804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  step1[4] = step3[4] - step2[5];
805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  step1[5] = step3[4] + step2[5];
806ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[6] = step3[7] - step2[6];
807ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step1[7] = step3[7] + step2[6];
808ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
809ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // step 6
810ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
811ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
8122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[1] = fdct_round_shift(temp1);
8132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[9] = fdct_round_shift(temp2);
814ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
815ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
816ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
8172ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[5] = fdct_round_shift(temp1);
8182ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[13] = fdct_round_shift(temp2);
819ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
820ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
821ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
8222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[3] = fdct_round_shift(temp1);
8232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[11] = fdct_round_shift(temp2);
824ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
825ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
826ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
8272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[7] = fdct_round_shift(temp1);
8282ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  out[15] = fdct_round_shift(temp2);
829ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
830ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void fadst16(const int16_t *input, int16_t *output) {
832ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
833ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
834ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x0 = input[15];
835ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x1 = input[0];
836ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x2 = input[13];
837ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x3 = input[2];
838ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x4 = input[11];
839ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x5 = input[4];
840ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x6 = input[9];
841ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x7 = input[6];
842ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x8 = input[7];
843ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x9 = input[8];
844ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x10 = input[5];
845ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x11 = input[10];
846ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x12 = input[3];
847ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x13 = input[12];
848ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x14 = input[1];
849ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int x15 = input[14];
850ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
851ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 1
852ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
853ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
854ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
855ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
856ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
857ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
858ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
859ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
860ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
861ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
862ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
863ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
864ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
865ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
866ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
867ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
868ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
8692ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x0 = fdct_round_shift(s0 + s8);
8702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x1 = fdct_round_shift(s1 + s9);
8712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x2 = fdct_round_shift(s2 + s10);
8722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x3 = fdct_round_shift(s3 + s11);
8732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x4 = fdct_round_shift(s4 + s12);
8742ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x5 = fdct_round_shift(s5 + s13);
8752ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x6 = fdct_round_shift(s6 + s14);
8762ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x7 = fdct_round_shift(s7 + s15);
8772ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x8  = fdct_round_shift(s0 - s8);
8782ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x9  = fdct_round_shift(s1 - s9);
8792ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x10 = fdct_round_shift(s2 - s10);
8802ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x11 = fdct_round_shift(s3 - s11);
8812ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x12 = fdct_round_shift(s4 - s12);
8822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x13 = fdct_round_shift(s5 - s13);
8832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x14 = fdct_round_shift(s6 - s14);
8842ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x15 = fdct_round_shift(s7 - s15);
885ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
886ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 2
887ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0;
888ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1;
889ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2;
890ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x3;
891ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = x4;
892ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = x5;
893ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = x6;
894ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = x7;
895ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
896ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
897ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
898ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
899ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
900ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
901ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
902ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
903ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
904ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s4;
905ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 + s5;
906ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s2 + s6;
907ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s3 + s7;
908ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x4 = s0 - s4;
909ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x5 = s1 - s5;
910ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x6 = s2 - s6;
911ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x7 = s3 - s7;
9122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x8 = fdct_round_shift(s8 + s12);
9132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x9 = fdct_round_shift(s9 + s13);
9142ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x10 = fdct_round_shift(s10 + s14);
9152ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x11 = fdct_round_shift(s11 + s15);
9162ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x12 = fdct_round_shift(s8 - s12);
9172ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x13 = fdct_round_shift(s9 - s13);
9182ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x14 = fdct_round_shift(s10 - s14);
9192ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x15 = fdct_round_shift(s11 - s15);
920ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
921ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 3
922ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s0 = x0;
923ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s1 = x1;
924ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = x2;
925ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = x3;
926ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
927ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
928ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
929ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
930ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s8 = x8;
931ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s9 = x9;
932ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 = x10;
933ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 = x11;
934ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
935ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
936ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
937ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
938ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
939ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = s0 + s2;
940ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = s1 + s3;
941ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x2 = s0 - s2;
942ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x3 = s1 - s3;
9432ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x4 = fdct_round_shift(s4 + s6);
9442ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x5 = fdct_round_shift(s5 + s7);
9452ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x6 = fdct_round_shift(s4 - s6);
9462ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x7 = fdct_round_shift(s5 - s7);
947ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x8 = s8 + s10;
948ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x9 = s9 + s11;
949ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x10 = s8 - s10;
950ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x11 = s9 - s11;
9512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x12 = fdct_round_shift(s12 + s14);
9522ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x13 = fdct_round_shift(s13 + s15);
9532ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x14 = fdct_round_shift(s12 - s14);
9542ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x15 = fdct_round_shift(s13 - s15);
955ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
956ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // stage 4
957ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s2 = (- cospi_16_64) * (x2 + x3);
958ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s3 = cospi_16_64 * (x2 - x3);
959ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s6 = cospi_16_64 * (x6 + x7);
960ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s7 = cospi_16_64 * (- x6 + x7);
961ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s10 = cospi_16_64 * (x10 + x11);
962ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s11 = cospi_16_64 * (- x10 + x11);
963ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s14 = (- cospi_16_64) * (x14 + x15);
964ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  s15 = cospi_16_64 * (x14 - x15);
965ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
9662ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x2 = fdct_round_shift(s2);
9672ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x3 = fdct_round_shift(s3);
9682ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x6 = fdct_round_shift(s6);
9692ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x7 = fdct_round_shift(s7);
9702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x10 = fdct_round_shift(s10);
9712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x11 = fdct_round_shift(s11);
9722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x14 = fdct_round_shift(s14);
9732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  x15 = fdct_round_shift(s15);
974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = x0;
976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = - x8;
977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = x12;
978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = - x4;
979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] = x6;
980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = x14;
981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] = x10;
982ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = x2;
983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8] = x3;
984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9] =  x11;
985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] = x15;
986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] = x7;
987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] = x5;
988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = - x13;
989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] = x9;
990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = - x1;
991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangstatic const transform_2d FHT_16[] = {
9945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fdct16,  fdct16  },  // DCT_DCT  = 0
9955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fadst16, fdct16  },  // ADST_DCT = 1
9965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fdct16,  fadst16 },  // DCT_ADST = 2
9975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  { fadst16, fadst16 }   // ADST_ADST = 3
998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang};
999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
10002ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianvoid vp9_fht16x16_c(const int16_t *input, int16_t *output,
10012ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                    int stride, int tx_type) {
10022ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  if (tx_type == DCT_DCT) {
10032ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    vp9_fdct16x16_c(input, output, stride);
10042ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  } else {
10052ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t out[256];
10062ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t *outptr = &out[0];
10072ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int i, j;
10082ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    int16_t temp_in[16], temp_out[16];
10092ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    const transform_2d ht = FHT_16[tx_type];
10102ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
10112ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    // Columns
10122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    for (i = 0; i < 16; ++i) {
10132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j)
10142ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        temp_in[j] = input[j * stride + i] * 4;
10152ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      ht.cols(temp_in, temp_out);
10162ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j)
10172ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
10182ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    }
1019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
10202ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    // Rows
10212ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    for (i = 0; i < 16; ++i) {
10222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j)
10232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        temp_in[j] = out[j + i * 16];
10242ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      ht.rows(temp_in, temp_out);
10252ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      for (j = 0; j < 16; ++j)
10262ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian        output[j + i * 16] = temp_out[j];
10272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    }
1028ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  }
1029ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1030ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1031ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangstatic INLINE int dct_32_round(int input) {
1032ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1033ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  assert(-131072 <= rv && rv <= 131071);
1034ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  return rv;
1035ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1036ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1037ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangstatic INLINE int half_round_shift(int input) {
1038ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int rv = (input + 1 + (input < 0)) >> 2;
1039ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  return rv;
1040ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1041ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
10422ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianstatic void fdct32(const int *input, int *output, int round) {
1043ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  int step[32];
1044ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 1
1045ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[0] = input[0] + input[(32 - 1)];
1046ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[1] = input[1] + input[(32 - 2)];
1047ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[2] = input[2] + input[(32 - 3)];
1048ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[3] = input[3] + input[(32 - 4)];
1049ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[4] = input[4] + input[(32 - 5)];
1050ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[5] = input[5] + input[(32 - 6)];
1051ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[6] = input[6] + input[(32 - 7)];
1052ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[7] = input[7] + input[(32 - 8)];
1053ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[8] = input[8] + input[(32 - 9)];
1054ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[9] = input[9] + input[(32 - 10)];
1055ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[10] = input[10] + input[(32 - 11)];
1056ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[11] = input[11] + input[(32 - 12)];
1057ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[12] = input[12] + input[(32 - 13)];
1058ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[13] = input[13] + input[(32 - 14)];
1059ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[14] = input[14] + input[(32 - 15)];
1060ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[15] = input[15] + input[(32 - 16)];
1061ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[16] = -input[16] + input[(32 - 17)];
1062ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[17] = -input[17] + input[(32 - 18)];
1063ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[18] = -input[18] + input[(32 - 19)];
1064ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[19] = -input[19] + input[(32 - 20)];
1065ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[20] = -input[20] + input[(32 - 21)];
1066ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[21] = -input[21] + input[(32 - 22)];
1067ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[22] = -input[22] + input[(32 - 23)];
1068ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[23] = -input[23] + input[(32 - 24)];
1069ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[24] = -input[24] + input[(32 - 25)];
1070ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[25] = -input[25] + input[(32 - 26)];
1071ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[26] = -input[26] + input[(32 - 27)];
1072ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[27] = -input[27] + input[(32 - 28)];
1073ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[28] = -input[28] + input[(32 - 29)];
1074ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[29] = -input[29] + input[(32 - 30)];
1075ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[30] = -input[30] + input[(32 - 31)];
1076ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[31] = -input[31] + input[(32 - 32)];
1077ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1078ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 2
1079ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = step[0] + step[16 - 1];
1080ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = step[1] + step[16 - 2];
1081ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = step[2] + step[16 - 3];
1082ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = step[3] + step[16 - 4];
1083ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] = step[4] + step[16 - 5];
1084ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = step[5] + step[16 - 6];
1085ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] = step[6] + step[16 - 7];
1086ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = step[7] + step[16 - 8];
1087ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8] = -step[8] + step[16 - 9];
1088ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9] = -step[9] + step[16 - 10];
1089ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] = -step[10] + step[16 - 11];
1090ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] = -step[11] + step[16 - 12];
1091ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] = -step[12] + step[16 - 13];
1092ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = -step[13] + step[16 - 14];
1093ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] = -step[14] + step[16 - 15];
1094ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = -step[15] + step[16 - 16];
1095ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1096ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[16] = step[16];
1097ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[17] = step[17];
1098ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[18] = step[18];
1099ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[19] = step[19];
1100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[28] = step[28];
1112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[29] = step[29];
1113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[30] = step[30];
1114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[31] = step[31];
1115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
  // Damp the magnitude by a factor of 4 so that the intermediate values stay
  // within the range of 16 bits.
11181184aebb761cbeac9124c37189a80a1a58f04b6bhkuang  if (round) {
11191184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[0] = half_round_shift(output[0]);
11201184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[1] = half_round_shift(output[1]);
11211184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[2] = half_round_shift(output[2]);
11221184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[3] = half_round_shift(output[3]);
11231184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[4] = half_round_shift(output[4]);
11241184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[5] = half_round_shift(output[5]);
11251184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[6] = half_round_shift(output[6]);
11261184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[7] = half_round_shift(output[7]);
11271184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[8] = half_round_shift(output[8]);
11281184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[9] = half_round_shift(output[9]);
11291184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[10] = half_round_shift(output[10]);
11301184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[11] = half_round_shift(output[11]);
11311184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[12] = half_round_shift(output[12]);
11321184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[13] = half_round_shift(output[13]);
11331184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[14] = half_round_shift(output[14]);
11341184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[15] = half_round_shift(output[15]);
11351184aebb761cbeac9124c37189a80a1a58f04b6bhkuang
11361184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[16] = half_round_shift(output[16]);
11371184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[17] = half_round_shift(output[17]);
11381184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[18] = half_round_shift(output[18]);
11391184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[19] = half_round_shift(output[19]);
11401184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[20] = half_round_shift(output[20]);
11411184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[21] = half_round_shift(output[21]);
11421184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[22] = half_round_shift(output[22]);
11431184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[23] = half_round_shift(output[23]);
11441184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[24] = half_round_shift(output[24]);
11451184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[25] = half_round_shift(output[25]);
11461184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[26] = half_round_shift(output[26]);
11471184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[27] = half_round_shift(output[27]);
11481184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[28] = half_round_shift(output[28]);
11491184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[29] = half_round_shift(output[29]);
11501184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[30] = half_round_shift(output[30]);
11511184aebb761cbeac9124c37189a80a1a58f04b6bhkuang    output[31] = half_round_shift(output[31]);
11521184aebb761cbeac9124c37189a80a1a58f04b6bhkuang  }
11531184aebb761cbeac9124c37189a80a1a58f04b6bhkuang
1154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 3
1155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[0] = output[0] + output[(8 - 1)];
1156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[1] = output[1] + output[(8 - 2)];
1157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[2] = output[2] + output[(8 - 3)];
1158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[3] = output[3] + output[(8 - 4)];
1159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[4] = -output[4] + output[(8 - 5)];
1160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[5] = -output[5] + output[(8 - 6)];
1161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[6] = -output[6] + output[(8 - 7)];
1162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[7] = -output[7] + output[(8 - 8)];
1163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[8] = output[8];
1164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[9] = output[9];
1165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[14] = output[14];
1170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[15] = output[15];
1171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[16] = output[16] + output[23];
1173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[17] = output[17] + output[22];
1174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[18] = output[18] + output[21];
1175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[19] = output[19] + output[20];
1176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[20] = -output[20] + output[19];
1177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[21] = -output[21] + output[18];
1178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[22] = -output[22] + output[17];
1179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[23] = -output[23] + output[16];
1180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[24] = -output[24] + output[31];
1181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[25] = -output[25] + output[30];
1182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[26] = -output[26] + output[29];
1183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[27] = -output[27] + output[28];
1184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[28] = output[28] + output[27];
1185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[29] = output[29] + output[26];
1186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[30] = output[30] + output[25];
1187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[31] = output[31] + output[24];
1188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 4
1190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = step[0] + step[3];
1191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = step[1] + step[2];
1192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = -step[2] + step[1];
1193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = -step[3] + step[0];
1194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] = step[4];
1195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = step[7];
1198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8] = step[8] + step[11];
1199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9] = step[9] + step[10];
1200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] = -step[10] + step[9];
1201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] = -step[11] + step[8];
1202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] = -step[12] + step[15];
1203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = -step[13] + step[14];
1204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] = step[14] + step[13];
1205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = step[15] + step[12];
1206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[16] = step[16];
1208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[17] = step[17];
1209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[22] = step[22];
1214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[23] = step[23];
1215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[24] = step[24];
1216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[25] = step[25];
1217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[30] = step[30];
1222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[31] = step[31];
1223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 5
1225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[4] = output[4] + output[5];
1230ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[5] = -output[5] + output[4];
1231ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[6] = -output[6] + output[7];
1232ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[7] = output[7] + output[6];
1233ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[8] = output[8];
1234ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1235ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1236ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[11] = output[11];
1237ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[12] = output[12];
1238ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1239ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1240ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[15] = output[15];
1241ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1242ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[16] = output[16] + output[19];
1243ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[17] = output[17] + output[18];
1244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[18] = -output[18] + output[17];
1245ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[19] = -output[19] + output[16];
1246ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[20] = -output[20] + output[23];
1247ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[21] = -output[21] + output[22];
1248ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[22] = output[22] + output[21];
1249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[23] = output[23] + output[20];
1250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[24] = output[24] + output[27];
1251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[25] = output[25] + output[26];
1252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[26] = -output[26] + output[25];
1253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[27] = -output[27] + output[24];
1254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[28] = -output[28] + output[31];
1255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[29] = -output[29] + output[30];
1256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[30] = output[30] + output[29];
1257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[31] = output[31] + output[28];
1258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 6
1260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0] = step[0];
1261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1] = step[1];
1262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2] = step[2];
1263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3] = step[3];
1264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8] = step[8] + step[9];
1269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9] = -step[9] + step[8];
1270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] = -step[10] + step[11];
1271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] = step[11] + step[10];
1272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] = step[12] + step[13];
1273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = -step[13] + step[12];
1274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] = -step[14] + step[15];
1275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = step[15] + step[14];
1276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[16] = step[16];
1278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[19] = step[19];
1281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[20] = step[20];
1282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[23] = step[23];
1285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[24] = step[24];
1286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[27] = step[27];
1289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[28] = step[28];
1290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[31] = step[31];
1293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Stage 7
1295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[0] = output[0];
1296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[1] = output[1];
1297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[2] = output[2];
1298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[3] = output[3];
1299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[4] = output[4];
1300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[5] = output[5];
1301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[6] = output[6];
1302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[7] = output[7];
1303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[16] = output[16] + output[17];
1313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[17] = -output[17] + output[16];
1314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[18] = -output[18] + output[19];
1315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[19] = output[19] + output[18];
1316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[20] = output[20] + output[21];
1317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[21] = -output[21] + output[20];
1318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[22] = -output[22] + output[23];
1319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[23] = output[23] + output[22];
1320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[24] = output[24] + output[25];
1321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[25] = -output[25] + output[24];
1322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[26] = -output[26] + output[27];
1323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[27] = output[27] + output[26];
1324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[28] = output[28] + output[29];
1325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[29] = -output[29] + output[28];
1326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[30] = -output[30] + output[31];
1327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  step[31] = output[31] + output[30];
1328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  // Final stage --- outputs indices are bit-reversed.
1330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[0]  = step[0];
1331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[16] = step[1];
1332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[8]  = step[2];
1333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[24] = step[3];
1334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[4]  = step[4];
1335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[20] = step[5];
1336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[12] = step[6];
1337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[28] = step[7];
1338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[2]  = step[8];
1339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[18] = step[9];
1340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[10] = step[10];
1341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[26] = step[11];
1342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[6]  = step[12];
1343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[22] = step[13];
1344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[14] = step[14];
1345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[30] = step[15];
1346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
1364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Reduces a 32x32 block to its first coefficient only.
//
// input:  32x32 block of samples; row r starts at input + r * stride.
// output: output[0] receives (sum of all 1024 samples) >> 3 and output[1]
//         is set to 0. No other element of output is written.
// stride: distance, in elements, between the starts of consecutive rows.
void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  // Accumulate in int rather than int16_t: 1024 samples are summed, so a
  // 16-bit accumulator is truncated on every addition as soon as the running
  // total leaves [-32768, 32767], even when the final (sum >> 3) would still
  // fit in the 16-bit output.
  int sum = 0;

  for (r = 0; r < 32; ++r) {
    const int16_t *row = input + r * stride;
    for (c = 0; c < 32; ++c)
      sum += row[c];
  }

  // Narrow only once, after the scaling shift.
  output[0] = (int16_t)(sum >> 3);
  output[1] = 0;
}
1375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Full 32x32 forward transform built from two passes of the 32-point
// fdct32() routine: first over every column of the input, then over every
// row of the intermediate result.
//
// input:  32x32 block of samples; row r starts at input + r * stride.
// out:    receives the 32x32 coefficients, stored densely (32 per row).
// stride: distance, in elements, between the starts of consecutive rows of
//         input.
void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
  int intermediate[32 * 32];
  int line_in[32], line_out[32];
  int col, row, k;

  // Pass 1 (columns): samples are scaled up by 4 before the transform and
  // the result is scaled back down by 4 with a rounding bias that is one
  // larger for strictly positive values.
  for (col = 0; col < 32; ++col) {
    const int16_t *src = input + col;

    for (k = 0; k < 32; ++k) {
      line_in[k] = src[k * stride] * 4;
    }

    fdct32(line_in, line_out, 0);

    for (k = 0; k < 32; ++k) {
      const int v = line_out[k];
      const int bias = (v > 0) ? 2 : 1;
      intermediate[k * 32 + col] = (v + bias) >> 2;
    }
  }

  // Pass 2 (rows): same down-scaling by 4, but here the larger rounding bias
  // is applied to strictly negative values.
  for (row = 0; row < 32; ++row) {
    const int *src_row = intermediate + row * 32;
    int16_t *dst_row = out + row * 32;

    for (k = 0; k < 32; ++k) {
      line_in[k] = src_row[k];
    }

    fdct32(line_in, line_out, 0);

    for (k = 0; k < 32; ++k) {
      const int v = line_out[k];
      const int bias = (v < 0) ? 2 : 1;
      dst_row[k] = (v + bias) >> 2;
    }
  }
}
1400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
// Variant of the 32x32 forward transform used by the rate-distortion
// optimization loop. Although the 32-point transform rounds with
// dct_32_round internally, this 2-D version operates within 16 bits of
// precision.
//
// The column pass matches the regular 32x32 transform (input scaled by 4,
// result scaled back with rounding). The row pass instead calls fdct32()
// with its last argument set to 1 and stores the transform output directly,
// with no final down-scaling step here.
//
// input:  32x32 block of samples; row r starts at input + r * stride.
// out:    receives the 32x32 coefficients, stored densely (32 per row).
// stride: distance, in elements, between the starts of consecutive rows of
//         input.
void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
  int intermediate[32 * 32];
  int line_in[32], line_out[32];
  int col, row, k;

  // Pass 1: columns.
  for (col = 0; col < 32; ++col) {
    const int16_t *src = input + col;

    for (k = 0; k < 32; ++k) {
      line_in[k] = src[k * stride] * 4;
    }

    fdct32(line_in, line_out, 0);

    // TODO(cd): see quality impact of only doing
    //           intermediate[k * 32 + col] = (line_out[k] + 1) >> 2;
    //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
    for (k = 0; k < 32; ++k) {
      const int v = line_out[k];
      const int bias = (v > 0) ? 2 : 1;
      intermediate[k * 32 + col] = (v + bias) >> 2;
    }
  }

  // Pass 2: rows, transformed with the rounding flag of fdct32() enabled.
  for (row = 0; row < 32; ++row) {
    const int *src_row = intermediate + row * 32;
    int16_t *dst_row = out + row * 32;

    for (k = 0; k < 32; ++k) {
      line_in[k] = src_row[k];
    }

    fdct32(line_in, line_out, 1);

    for (k = 0; k < 32; ++k) {
      dst_row[k] = line_out[k];
    }
  }
}
1431