1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <math.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16
17#include "vp9/common/vp9_blockd.h"
18#include "vp9/common/vp9_idct.h"
19#include "vp9/common/vp9_systemdependent.h"
20
21static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
22  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
23  // TODO(debargha, peter.derivaz): Find new bounds for this assert
24  // and make the bounds consts.
25  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
26  return rv;
27}
28
29static void fdct4(const tran_low_t *input, tran_low_t *output) {
30  tran_high_t step[4];
31  tran_high_t temp1, temp2;
32
33  step[0] = input[0] + input[3];
34  step[1] = input[1] + input[2];
35  step[2] = input[1] - input[2];
36  step[3] = input[0] - input[3];
37
38  temp1 = (step[0] + step[1]) * cospi_16_64;
39  temp2 = (step[0] - step[1]) * cospi_16_64;
40  output[0] = fdct_round_shift(temp1);
41  output[2] = fdct_round_shift(temp2);
42  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
43  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
44  output[1] = fdct_round_shift(temp1);
45  output[3] = fdct_round_shift(temp2);
46}
47
48void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
49  int r, c;
50  tran_low_t sum = 0;
51  for (r = 0; r < 4; ++r)
52    for (c = 0; c < 4; ++c)
53      sum += input[r * stride + c];
54
55  output[0] = sum << 1;
56  output[1] = 0;
57}
58
59void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
60  // The 2D transform is done with two passes which are actually pretty
61  // similar. In the first one, we transform the columns and transpose
62  // the results. In the second one, we transform the rows. To achieve that,
63  // as the first pass results are transposed, we transpose the columns (that
64  // is the transposed rows) and transpose the results (so that it goes back
65  // in normal/row positions).
66  int pass;
67  // We need an intermediate buffer between passes.
68  tran_low_t intermediate[4 * 4];
69  const int16_t *in_pass0 = input;
70  const tran_low_t *in = NULL;
71  tran_low_t *out = intermediate;
72  // Do the two transform/transpose passes
73  for (pass = 0; pass < 2; ++pass) {
74    tran_high_t input[4];      // canbe16
75    tran_high_t step[4];       // canbe16
76    tran_high_t temp1, temp2;  // needs32
77    int i;
78    for (i = 0; i < 4; ++i) {
79      // Load inputs.
80      if (0 == pass) {
81        input[0] = in_pass0[0 * stride] * 16;
82        input[1] = in_pass0[1 * stride] * 16;
83        input[2] = in_pass0[2 * stride] * 16;
84        input[3] = in_pass0[3 * stride] * 16;
85        if (i == 0 && input[0]) {
86          input[0] += 1;
87        }
88      } else {
89        input[0] = in[0 * 4];
90        input[1] = in[1 * 4];
91        input[2] = in[2 * 4];
92        input[3] = in[3 * 4];
93      }
94      // Transform.
95      step[0] = input[0] + input[3];
96      step[1] = input[1] + input[2];
97      step[2] = input[1] - input[2];
98      step[3] = input[0] - input[3];
99      temp1 = (step[0] + step[1]) * cospi_16_64;
100      temp2 = (step[0] - step[1]) * cospi_16_64;
101      out[0] = fdct_round_shift(temp1);
102      out[2] = fdct_round_shift(temp2);
103      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
104      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
105      out[1] = fdct_round_shift(temp1);
106      out[3] = fdct_round_shift(temp2);
107      // Do next column (which is a transposed row in second/horizontal pass)
108      in_pass0++;
109      in++;
110      out += 4;
111    }
112    // Setup in/out for next pass.
113    in = intermediate;
114    out = output;
115  }
116
117  {
118    int i, j;
119    for (i = 0; i < 4; ++i) {
120      for (j = 0; j < 4; ++j)
121        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
122    }
123  }
124}
125
126static void fadst4(const tran_low_t *input, tran_low_t *output) {
127  tran_high_t x0, x1, x2, x3;
128  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
129
130  x0 = input[0];
131  x1 = input[1];
132  x2 = input[2];
133  x3 = input[3];
134
135  if (!(x0 | x1 | x2 | x3)) {
136    output[0] = output[1] = output[2] = output[3] = 0;
137    return;
138  }
139
140  s0 = sinpi_1_9 * x0;
141  s1 = sinpi_4_9 * x0;
142  s2 = sinpi_2_9 * x1;
143  s3 = sinpi_1_9 * x1;
144  s4 = sinpi_3_9 * x2;
145  s5 = sinpi_4_9 * x3;
146  s6 = sinpi_2_9 * x3;
147  s7 = x0 + x1 - x3;
148
149  x0 = s0 + s2 + s5;
150  x1 = sinpi_3_9 * s7;
151  x2 = s1 - s3 + s6;
152  x3 = s4;
153
154  s0 = x0 + x3;
155  s1 = x1;
156  s2 = x2 - x3;
157  s3 = x2 - x0 + x3;
158
159  // 1-D transform scaling factor is sqrt(2).
160  output[0] = fdct_round_shift(s0);
161  output[1] = fdct_round_shift(s1);
162  output[2] = fdct_round_shift(s2);
163  output[3] = fdct_round_shift(s3);
164}
165
166static const transform_2d FHT_4[] = {
167  { fdct4,  fdct4  },  // DCT_DCT  = 0
168  { fadst4, fdct4  },  // ADST_DCT = 1
169  { fdct4,  fadst4 },  // DCT_ADST = 2
170  { fadst4, fadst4 }   // ADST_ADST = 3
171};
172
173void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
174                  int stride, int tx_type) {
175  if (tx_type == DCT_DCT) {
176    vp9_fdct4x4_c(input, output, stride);
177  } else {
178    tran_low_t out[4 * 4];
179    tran_low_t *outptr = &out[0];
180    int i, j;
181    tran_low_t temp_in[4], temp_out[4];
182    const transform_2d ht = FHT_4[tx_type];
183
184    // Columns
185    for (i = 0; i < 4; ++i) {
186      for (j = 0; j < 4; ++j)
187        temp_in[j] = input[j * stride + i] * 16;
188      if (i == 0 && temp_in[0])
189        temp_in[0] += 1;
190      ht.cols(temp_in, temp_out);
191      for (j = 0; j < 4; ++j)
192        outptr[j * 4 + i] = temp_out[j];
193    }
194
195    // Rows
196    for (i = 0; i < 4; ++i) {
197      for (j = 0; j < 4; ++j)
198        temp_in[j] = out[j + i * 4];
199      ht.rows(temp_in, temp_out);
200      for (j = 0; j < 4; ++j)
201        output[j + i * 4] = (temp_out[j] + 1) >> 2;
202    }
203  }
204}
205
206static void fdct8(const tran_low_t *input, tran_low_t *output) {
207  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
208  tran_high_t t0, t1, t2, t3;                  // needs32
209  tran_high_t x0, x1, x2, x3;                  // canbe16
210
211  // stage 1
212  s0 = input[0] + input[7];
213  s1 = input[1] + input[6];
214  s2 = input[2] + input[5];
215  s3 = input[3] + input[4];
216  s4 = input[3] - input[4];
217  s5 = input[2] - input[5];
218  s6 = input[1] - input[6];
219  s7 = input[0] - input[7];
220
221  // fdct4(step, step);
222  x0 = s0 + s3;
223  x1 = s1 + s2;
224  x2 = s1 - s2;
225  x3 = s0 - s3;
226  t0 = (x0 + x1) * cospi_16_64;
227  t1 = (x0 - x1) * cospi_16_64;
228  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
229  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
230  output[0] = fdct_round_shift(t0);
231  output[2] = fdct_round_shift(t2);
232  output[4] = fdct_round_shift(t1);
233  output[6] = fdct_round_shift(t3);
234
235  // Stage 2
236  t0 = (s6 - s5) * cospi_16_64;
237  t1 = (s6 + s5) * cospi_16_64;
238  t2 = fdct_round_shift(t0);
239  t3 = fdct_round_shift(t1);
240
241  // Stage 3
242  x0 = s4 + t2;
243  x1 = s4 - t2;
244  x2 = s7 - t3;
245  x3 = s7 + t3;
246
247  // Stage 4
248  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
249  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
250  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
251  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
252  output[1] = fdct_round_shift(t0);
253  output[3] = fdct_round_shift(t2);
254  output[5] = fdct_round_shift(t1);
255  output[7] = fdct_round_shift(t3);
256}
257
258void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
259  int r, c;
260  tran_low_t sum = 0;
261  for (r = 0; r < 8; ++r)
262    for (c = 0; c < 8; ++c)
263      sum += input[r * stride + c];
264
265  output[0] = sum;
266  output[1] = 0;
267}
268
269void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
270  int i, j;
271  tran_low_t intermediate[64];
272
273  // Transform columns
274  {
275    tran_low_t *output = intermediate;
276    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
277    tran_high_t t0, t1, t2, t3;                  // needs32
278    tran_high_t x0, x1, x2, x3;                  // canbe16
279
280    int i;
281    for (i = 0; i < 8; i++) {
282      // stage 1
283      s0 = (input[0 * stride] + input[7 * stride]) * 4;
284      s1 = (input[1 * stride] + input[6 * stride]) * 4;
285      s2 = (input[2 * stride] + input[5 * stride]) * 4;
286      s3 = (input[3 * stride] + input[4 * stride]) * 4;
287      s4 = (input[3 * stride] - input[4 * stride]) * 4;
288      s5 = (input[2 * stride] - input[5 * stride]) * 4;
289      s6 = (input[1 * stride] - input[6 * stride]) * 4;
290      s7 = (input[0 * stride] - input[7 * stride]) * 4;
291
292      // fdct4(step, step);
293      x0 = s0 + s3;
294      x1 = s1 + s2;
295      x2 = s1 - s2;
296      x3 = s0 - s3;
297      t0 = (x0 + x1) * cospi_16_64;
298      t1 = (x0 - x1) * cospi_16_64;
299      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
300      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
301      output[0 * 8] = fdct_round_shift(t0);
302      output[2 * 8] = fdct_round_shift(t2);
303      output[4 * 8] = fdct_round_shift(t1);
304      output[6 * 8] = fdct_round_shift(t3);
305
306      // Stage 2
307      t0 = (s6 - s5) * cospi_16_64;
308      t1 = (s6 + s5) * cospi_16_64;
309      t2 = fdct_round_shift(t0);
310      t3 = fdct_round_shift(t1);
311
312      // Stage 3
313      x0 = s4 + t2;
314      x1 = s4 - t2;
315      x2 = s7 - t3;
316      x3 = s7 + t3;
317
318      // Stage 4
319      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
320      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
321      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
322      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
323      output[1 * 8] = fdct_round_shift(t0);
324      output[3 * 8] = fdct_round_shift(t2);
325      output[5 * 8] = fdct_round_shift(t1);
326      output[7 * 8] = fdct_round_shift(t3);
327      input++;
328      output++;
329    }
330  }
331
332  // Rows
333  for (i = 0; i < 8; ++i) {
334    fdct8(&intermediate[i * 8], &final_output[i * 8]);
335    for (j = 0; j < 8; ++j)
336      final_output[j + i * 8] /= 2;
337  }
338}
339
340void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
341  int r, c;
342  tran_low_t sum = 0;
343  for (r = 0; r < 16; ++r)
344    for (c = 0; c < 16; ++c)
345      sum += input[r * stride + c];
346
347  output[0] = sum >> 1;
348  output[1] = 0;
349}
350
351void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
352  // The 2D transform is done with two passes which are actually pretty
353  // similar. In the first one, we transform the columns and transpose
354  // the results. In the second one, we transform the rows. To achieve that,
355  // as the first pass results are transposed, we transpose the columns (that
356  // is the transposed rows) and transpose the results (so that it goes back
357  // in normal/row positions).
358  int pass;
359  // We need an intermediate buffer between passes.
360  tran_low_t intermediate[256];
361  const int16_t *in_pass0 = input;
362  const tran_low_t *in = NULL;
363  tran_low_t *out = intermediate;
364  // Do the two transform/transpose passes
365  for (pass = 0; pass < 2; ++pass) {
366    tran_high_t step1[8];      // canbe16
367    tran_high_t step2[8];      // canbe16
368    tran_high_t step3[8];      // canbe16
369    tran_high_t input[8];      // canbe16
370    tran_high_t temp1, temp2;  // needs32
371    int i;
372    for (i = 0; i < 16; i++) {
373      if (0 == pass) {
374        // Calculate input for the first 8 results.
375        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
376        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
377        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
378        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
379        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
380        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
381        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
382        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
383        // Calculate input for the next 8 results.
384        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
385        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
386        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
387        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
388        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
389        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
390        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
391        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
392      } else {
393        // Calculate input for the first 8 results.
394        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
395        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
396        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
397        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
398        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
399        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
400        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
401        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
402        // Calculate input for the next 8 results.
403        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
404        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
405        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
406        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
407        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
408        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
409        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
410        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
411      }
412      // Work on the first eight values; fdct8(input, even_results);
413      {
414        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
415        tran_high_t t0, t1, t2, t3;                  // needs32
416        tran_high_t x0, x1, x2, x3;                  // canbe16
417
418        // stage 1
419        s0 = input[0] + input[7];
420        s1 = input[1] + input[6];
421        s2 = input[2] + input[5];
422        s3 = input[3] + input[4];
423        s4 = input[3] - input[4];
424        s5 = input[2] - input[5];
425        s6 = input[1] - input[6];
426        s7 = input[0] - input[7];
427
428        // fdct4(step, step);
429        x0 = s0 + s3;
430        x1 = s1 + s2;
431        x2 = s1 - s2;
432        x3 = s0 - s3;
433        t0 = (x0 + x1) * cospi_16_64;
434        t1 = (x0 - x1) * cospi_16_64;
435        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
436        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
437        out[0] = fdct_round_shift(t0);
438        out[4] = fdct_round_shift(t2);
439        out[8] = fdct_round_shift(t1);
440        out[12] = fdct_round_shift(t3);
441
442        // Stage 2
443        t0 = (s6 - s5) * cospi_16_64;
444        t1 = (s6 + s5) * cospi_16_64;
445        t2 = fdct_round_shift(t0);
446        t3 = fdct_round_shift(t1);
447
448        // Stage 3
449        x0 = s4 + t2;
450        x1 = s4 - t2;
451        x2 = s7 - t3;
452        x3 = s7 + t3;
453
454        // Stage 4
455        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
456        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
457        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
458        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
459        out[2] = fdct_round_shift(t0);
460        out[6] = fdct_round_shift(t2);
461        out[10] = fdct_round_shift(t1);
462        out[14] = fdct_round_shift(t3);
463      }
464      // Work on the next eight values; step1 -> odd_results
465      {
466        // step 2
467        temp1 = (step1[5] - step1[2]) * cospi_16_64;
468        temp2 = (step1[4] - step1[3]) * cospi_16_64;
469        step2[2] = fdct_round_shift(temp1);
470        step2[3] = fdct_round_shift(temp2);
471        temp1 = (step1[4] + step1[3]) * cospi_16_64;
472        temp2 = (step1[5] + step1[2]) * cospi_16_64;
473        step2[4] = fdct_round_shift(temp1);
474        step2[5] = fdct_round_shift(temp2);
475        // step 3
476        step3[0] = step1[0] + step2[3];
477        step3[1] = step1[1] + step2[2];
478        step3[2] = step1[1] - step2[2];
479        step3[3] = step1[0] - step2[3];
480        step3[4] = step1[7] - step2[4];
481        step3[5] = step1[6] - step2[5];
482        step3[6] = step1[6] + step2[5];
483        step3[7] = step1[7] + step2[4];
484        // step 4
485        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
486        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
487        step2[1] = fdct_round_shift(temp1);
488        step2[2] = fdct_round_shift(temp2);
489        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
490        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
491        step2[5] = fdct_round_shift(temp1);
492        step2[6] = fdct_round_shift(temp2);
493        // step 5
494        step1[0] = step3[0] + step2[1];
495        step1[1] = step3[0] - step2[1];
496        step1[2] = step3[3] + step2[2];
497        step1[3] = step3[3] - step2[2];
498        step1[4] = step3[4] - step2[5];
499        step1[5] = step3[4] + step2[5];
500        step1[6] = step3[7] - step2[6];
501        step1[7] = step3[7] + step2[6];
502        // step 6
503        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
504        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
505        out[1] = fdct_round_shift(temp1);
506        out[9] = fdct_round_shift(temp2);
507        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
508        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
509        out[5] = fdct_round_shift(temp1);
510        out[13] = fdct_round_shift(temp2);
511        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
512        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
513        out[3] = fdct_round_shift(temp1);
514        out[11] = fdct_round_shift(temp2);
515        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
516        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
517        out[7] = fdct_round_shift(temp1);
518        out[15] = fdct_round_shift(temp2);
519      }
520      // Do next column (which is a transposed row in second/horizontal pass)
521      in++;
522      in_pass0++;
523      out += 16;
524    }
525    // Setup in/out for next pass.
526    in = intermediate;
527    out = output;
528  }
529}
530
531static void fadst8(const tran_low_t *input, tran_low_t *output) {
532  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
533
534  tran_high_t x0 = input[7];
535  tran_high_t x1 = input[0];
536  tran_high_t x2 = input[5];
537  tran_high_t x3 = input[2];
538  tran_high_t x4 = input[3];
539  tran_high_t x5 = input[4];
540  tran_high_t x6 = input[1];
541  tran_high_t x7 = input[6];
542
543  // stage 1
544  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
545  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
546  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
547  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
548  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
549  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
550  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
551  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
552
553  x0 = fdct_round_shift(s0 + s4);
554  x1 = fdct_round_shift(s1 + s5);
555  x2 = fdct_round_shift(s2 + s6);
556  x3 = fdct_round_shift(s3 + s7);
557  x4 = fdct_round_shift(s0 - s4);
558  x5 = fdct_round_shift(s1 - s5);
559  x6 = fdct_round_shift(s2 - s6);
560  x7 = fdct_round_shift(s3 - s7);
561
562  // stage 2
563  s0 = x0;
564  s1 = x1;
565  s2 = x2;
566  s3 = x3;
567  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
568  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
569  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
570  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
571
572  x0 = s0 + s2;
573  x1 = s1 + s3;
574  x2 = s0 - s2;
575  x3 = s1 - s3;
576  x4 = fdct_round_shift(s4 + s6);
577  x5 = fdct_round_shift(s5 + s7);
578  x6 = fdct_round_shift(s4 - s6);
579  x7 = fdct_round_shift(s5 - s7);
580
581  // stage 3
582  s2 = cospi_16_64 * (x2 + x3);
583  s3 = cospi_16_64 * (x2 - x3);
584  s6 = cospi_16_64 * (x6 + x7);
585  s7 = cospi_16_64 * (x6 - x7);
586
587  x2 = fdct_round_shift(s2);
588  x3 = fdct_round_shift(s3);
589  x6 = fdct_round_shift(s6);
590  x7 = fdct_round_shift(s7);
591
592  output[0] =   x0;
593  output[1] = - x4;
594  output[2] =   x6;
595  output[3] = - x2;
596  output[4] =   x3;
597  output[5] = - x7;
598  output[6] =   x5;
599  output[7] = - x1;
600}
601
602static const transform_2d FHT_8[] = {
603  { fdct8,  fdct8  },  // DCT_DCT  = 0
604  { fadst8, fdct8  },  // ADST_DCT = 1
605  { fdct8,  fadst8 },  // DCT_ADST = 2
606  { fadst8, fadst8 }   // ADST_ADST = 3
607};
608
609void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
610                  int stride, int tx_type) {
611  if (tx_type == DCT_DCT) {
612    vp9_fdct8x8_c(input, output, stride);
613  } else {
614    tran_low_t out[64];
615    tran_low_t *outptr = &out[0];
616    int i, j;
617    tran_low_t temp_in[8], temp_out[8];
618    const transform_2d ht = FHT_8[tx_type];
619
620    // Columns
621    for (i = 0; i < 8; ++i) {
622      for (j = 0; j < 8; ++j)
623        temp_in[j] = input[j * stride + i] * 4;
624      ht.cols(temp_in, temp_out);
625      for (j = 0; j < 8; ++j)
626        outptr[j * 8 + i] = temp_out[j];
627    }
628
629    // Rows
630    for (i = 0; i < 8; ++i) {
631      for (j = 0; j < 8; ++j)
632        temp_in[j] = out[j + i * 8];
633      ht.rows(temp_in, temp_out);
634      for (j = 0; j < 8; ++j)
635        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
636    }
637  }
638}
639
640/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
641   pixel. */
642void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
643  int i;
644  tran_high_t a1, b1, c1, d1, e1;
645  const int16_t *ip_pass0 = input;
646  const tran_low_t *ip = NULL;
647  tran_low_t *op = output;
648
649  for (i = 0; i < 4; i++) {
650    a1 = ip_pass0[0 * stride];
651    b1 = ip_pass0[1 * stride];
652    c1 = ip_pass0[2 * stride];
653    d1 = ip_pass0[3 * stride];
654
655    a1 += b1;
656    d1 = d1 - c1;
657    e1 = (a1 - d1) >> 1;
658    b1 = e1 - b1;
659    c1 = e1 - c1;
660    a1 -= c1;
661    d1 += b1;
662    op[0] = a1;
663    op[4] = c1;
664    op[8] = d1;
665    op[12] = b1;
666
667    ip_pass0++;
668    op++;
669  }
670  ip = output;
671  op = output;
672
673  for (i = 0; i < 4; i++) {
674    a1 = ip[0];
675    b1 = ip[1];
676    c1 = ip[2];
677    d1 = ip[3];
678
679    a1 += b1;
680    d1 -= c1;
681    e1 = (a1 - d1) >> 1;
682    b1 = e1 - b1;
683    c1 = e1 - c1;
684    a1 -= c1;
685    d1 += b1;
686    op[0] = a1 * UNIT_QUANT_FACTOR;
687    op[1] = c1 * UNIT_QUANT_FACTOR;
688    op[2] = d1 * UNIT_QUANT_FACTOR;
689    op[3] = b1 * UNIT_QUANT_FACTOR;
690
691    ip += 4;
692    op += 4;
693  }
694}
695
696// Rewrote to use same algorithm as others.
697static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
698  tran_high_t step1[8];      // canbe16
699  tran_high_t step2[8];      // canbe16
700  tran_high_t step3[8];      // canbe16
701  tran_high_t input[8];      // canbe16
702  tran_high_t temp1, temp2;  // needs32
703
704  // step 1
705  input[0] = in[0] + in[15];
706  input[1] = in[1] + in[14];
707  input[2] = in[2] + in[13];
708  input[3] = in[3] + in[12];
709  input[4] = in[4] + in[11];
710  input[5] = in[5] + in[10];
711  input[6] = in[6] + in[ 9];
712  input[7] = in[7] + in[ 8];
713
714  step1[0] = in[7] - in[ 8];
715  step1[1] = in[6] - in[ 9];
716  step1[2] = in[5] - in[10];
717  step1[3] = in[4] - in[11];
718  step1[4] = in[3] - in[12];
719  step1[5] = in[2] - in[13];
720  step1[6] = in[1] - in[14];
721  step1[7] = in[0] - in[15];
722
723  // fdct8(step, step);
724  {
725    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
726    tran_high_t t0, t1, t2, t3;                  // needs32
727    tran_high_t x0, x1, x2, x3;                  // canbe16
728
729    // stage 1
730    s0 = input[0] + input[7];
731    s1 = input[1] + input[6];
732    s2 = input[2] + input[5];
733    s3 = input[3] + input[4];
734    s4 = input[3] - input[4];
735    s5 = input[2] - input[5];
736    s6 = input[1] - input[6];
737    s7 = input[0] - input[7];
738
739    // fdct4(step, step);
740    x0 = s0 + s3;
741    x1 = s1 + s2;
742    x2 = s1 - s2;
743    x3 = s0 - s3;
744    t0 = (x0 + x1) * cospi_16_64;
745    t1 = (x0 - x1) * cospi_16_64;
746    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
747    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
748    out[0] = fdct_round_shift(t0);
749    out[4] = fdct_round_shift(t2);
750    out[8] = fdct_round_shift(t1);
751    out[12] = fdct_round_shift(t3);
752
753    // Stage 2
754    t0 = (s6 - s5) * cospi_16_64;
755    t1 = (s6 + s5) * cospi_16_64;
756    t2 = fdct_round_shift(t0);
757    t3 = fdct_round_shift(t1);
758
759    // Stage 3
760    x0 = s4 + t2;
761    x1 = s4 - t2;
762    x2 = s7 - t3;
763    x3 = s7 + t3;
764
765    // Stage 4
766    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
767    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
768    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
769    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
770    out[2] = fdct_round_shift(t0);
771    out[6] = fdct_round_shift(t2);
772    out[10] = fdct_round_shift(t1);
773    out[14] = fdct_round_shift(t3);
774  }
775
776  // step 2
777  temp1 = (step1[5] - step1[2]) * cospi_16_64;
778  temp2 = (step1[4] - step1[3]) * cospi_16_64;
779  step2[2] = fdct_round_shift(temp1);
780  step2[3] = fdct_round_shift(temp2);
781  temp1 = (step1[4] + step1[3]) * cospi_16_64;
782  temp2 = (step1[5] + step1[2]) * cospi_16_64;
783  step2[4] = fdct_round_shift(temp1);
784  step2[5] = fdct_round_shift(temp2);
785
786  // step 3
787  step3[0] = step1[0] + step2[3];
788  step3[1] = step1[1] + step2[2];
789  step3[2] = step1[1] - step2[2];
790  step3[3] = step1[0] - step2[3];
791  step3[4] = step1[7] - step2[4];
792  step3[5] = step1[6] - step2[5];
793  step3[6] = step1[6] + step2[5];
794  step3[7] = step1[7] + step2[4];
795
796  // step 4
797  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
798  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
799  step2[1] = fdct_round_shift(temp1);
800  step2[2] = fdct_round_shift(temp2);
801  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
802  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
803  step2[5] = fdct_round_shift(temp1);
804  step2[6] = fdct_round_shift(temp2);
805
806  // step 5
807  step1[0] = step3[0] + step2[1];
808  step1[1] = step3[0] - step2[1];
809  step1[2] = step3[3] + step2[2];
810  step1[3] = step3[3] - step2[2];
811  step1[4] = step3[4] - step2[5];
812  step1[5] = step3[4] + step2[5];
813  step1[6] = step3[7] - step2[6];
814  step1[7] = step3[7] + step2[6];
815
816  // step 6
817  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
818  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
819  out[1] = fdct_round_shift(temp1);
820  out[9] = fdct_round_shift(temp2);
821
822  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
823  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
824  out[5] = fdct_round_shift(temp1);
825  out[13] = fdct_round_shift(temp2);
826
827  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
828  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
829  out[3] = fdct_round_shift(temp1);
830  out[11] = fdct_round_shift(temp2);
831
832  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
833  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
834  out[7] = fdct_round_shift(temp1);
835  out[15] = fdct_round_shift(temp2);
836}
837
838static void fadst16(const tran_low_t *input, tran_low_t *output) {
839  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
840  tran_high_t s9, s10, s11, s12, s13, s14, s15;
841
842  tran_high_t x0 = input[15];
843  tran_high_t x1 = input[0];
844  tran_high_t x2 = input[13];
845  tran_high_t x3 = input[2];
846  tran_high_t x4 = input[11];
847  tran_high_t x5 = input[4];
848  tran_high_t x6 = input[9];
849  tran_high_t x7 = input[6];
850  tran_high_t x8 = input[7];
851  tran_high_t x9 = input[8];
852  tran_high_t x10 = input[5];
853  tran_high_t x11 = input[10];
854  tran_high_t x12 = input[3];
855  tran_high_t x13 = input[12];
856  tran_high_t x14 = input[1];
857  tran_high_t x15 = input[14];
858
859  // stage 1
860  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
861  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
862  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
863  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
864  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
865  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
866  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
867  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
868  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
869  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
870  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
871  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
872  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
873  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
874  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
875  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
876
877  x0 = fdct_round_shift(s0 + s8);
878  x1 = fdct_round_shift(s1 + s9);
879  x2 = fdct_round_shift(s2 + s10);
880  x3 = fdct_round_shift(s3 + s11);
881  x4 = fdct_round_shift(s4 + s12);
882  x5 = fdct_round_shift(s5 + s13);
883  x6 = fdct_round_shift(s6 + s14);
884  x7 = fdct_round_shift(s7 + s15);
885  x8  = fdct_round_shift(s0 - s8);
886  x9  = fdct_round_shift(s1 - s9);
887  x10 = fdct_round_shift(s2 - s10);
888  x11 = fdct_round_shift(s3 - s11);
889  x12 = fdct_round_shift(s4 - s12);
890  x13 = fdct_round_shift(s5 - s13);
891  x14 = fdct_round_shift(s6 - s14);
892  x15 = fdct_round_shift(s7 - s15);
893
894  // stage 2
895  s0 = x0;
896  s1 = x1;
897  s2 = x2;
898  s3 = x3;
899  s4 = x4;
900  s5 = x5;
901  s6 = x6;
902  s7 = x7;
903  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
904  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
905  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
906  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
907  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
908  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
909  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
910  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
911
912  x0 = s0 + s4;
913  x1 = s1 + s5;
914  x2 = s2 + s6;
915  x3 = s3 + s7;
916  x4 = s0 - s4;
917  x5 = s1 - s5;
918  x6 = s2 - s6;
919  x7 = s3 - s7;
920  x8 = fdct_round_shift(s8 + s12);
921  x9 = fdct_round_shift(s9 + s13);
922  x10 = fdct_round_shift(s10 + s14);
923  x11 = fdct_round_shift(s11 + s15);
924  x12 = fdct_round_shift(s8 - s12);
925  x13 = fdct_round_shift(s9 - s13);
926  x14 = fdct_round_shift(s10 - s14);
927  x15 = fdct_round_shift(s11 - s15);
928
929  // stage 3
930  s0 = x0;
931  s1 = x1;
932  s2 = x2;
933  s3 = x3;
934  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
935  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
936  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
937  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
938  s8 = x8;
939  s9 = x9;
940  s10 = x10;
941  s11 = x11;
942  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
943  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
944  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
945  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
946
947  x0 = s0 + s2;
948  x1 = s1 + s3;
949  x2 = s0 - s2;
950  x3 = s1 - s3;
951  x4 = fdct_round_shift(s4 + s6);
952  x5 = fdct_round_shift(s5 + s7);
953  x6 = fdct_round_shift(s4 - s6);
954  x7 = fdct_round_shift(s5 - s7);
955  x8 = s8 + s10;
956  x9 = s9 + s11;
957  x10 = s8 - s10;
958  x11 = s9 - s11;
959  x12 = fdct_round_shift(s12 + s14);
960  x13 = fdct_round_shift(s13 + s15);
961  x14 = fdct_round_shift(s12 - s14);
962  x15 = fdct_round_shift(s13 - s15);
963
964  // stage 4
965  s2 = (- cospi_16_64) * (x2 + x3);
966  s3 = cospi_16_64 * (x2 - x3);
967  s6 = cospi_16_64 * (x6 + x7);
968  s7 = cospi_16_64 * (- x6 + x7);
969  s10 = cospi_16_64 * (x10 + x11);
970  s11 = cospi_16_64 * (- x10 + x11);
971  s14 = (- cospi_16_64) * (x14 + x15);
972  s15 = cospi_16_64 * (x14 - x15);
973
974  x2 = fdct_round_shift(s2);
975  x3 = fdct_round_shift(s3);
976  x6 = fdct_round_shift(s6);
977  x7 = fdct_round_shift(s7);
978  x10 = fdct_round_shift(s10);
979  x11 = fdct_round_shift(s11);
980  x14 = fdct_round_shift(s14);
981  x15 = fdct_round_shift(s15);
982
983  output[0] = x0;
984  output[1] = - x8;
985  output[2] = x12;
986  output[3] = - x4;
987  output[4] = x6;
988  output[5] = x14;
989  output[6] = x10;
990  output[7] = x2;
991  output[8] = x3;
992  output[9] =  x11;
993  output[10] = x15;
994  output[11] = x7;
995  output[12] = x5;
996  output[13] = - x13;
997  output[14] = x9;
998  output[15] = - x1;
999}
1000
1001static const transform_2d FHT_16[] = {
1002  { fdct16,  fdct16  },  // DCT_DCT  = 0
1003  { fadst16, fdct16  },  // ADST_DCT = 1
1004  { fdct16,  fadst16 },  // DCT_ADST = 2
1005  { fadst16, fadst16 }   // ADST_ADST = 3
1006};
1007
1008void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
1009                    int stride, int tx_type) {
1010  if (tx_type == DCT_DCT) {
1011    vp9_fdct16x16_c(input, output, stride);
1012  } else {
1013    tran_low_t out[256];
1014    tran_low_t *outptr = &out[0];
1015    int i, j;
1016    tran_low_t temp_in[16], temp_out[16];
1017    const transform_2d ht = FHT_16[tx_type];
1018
1019    // Columns
1020    for (i = 0; i < 16; ++i) {
1021      for (j = 0; j < 16; ++j)
1022        temp_in[j] = input[j * stride + i] * 4;
1023      ht.cols(temp_in, temp_out);
1024      for (j = 0; j < 16; ++j)
1025        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1026    }
1027
1028    // Rows
1029    for (i = 0; i < 16; ++i) {
1030      for (j = 0; j < 16; ++j)
1031        temp_in[j] = out[j + i * 16];
1032      ht.rows(temp_in, temp_out);
1033      for (j = 0; j < 16; ++j)
1034        output[j + i * 16] = temp_out[j];
1035    }
1036  }
1037}
1038
1039static INLINE tran_high_t dct_32_round(tran_high_t input) {
1040  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1041  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
1042  // and make the bounds consts.
1043  // assert(-131072 <= rv && rv <= 131071);
1044  return rv;
1045}
1046
1047static INLINE tran_high_t half_round_shift(tran_high_t input) {
1048  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
1049  return rv;
1050}
1051
1052static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
1053  tran_high_t step[32];
1054  // Stage 1
1055  step[0] = input[0] + input[(32 - 1)];
1056  step[1] = input[1] + input[(32 - 2)];
1057  step[2] = input[2] + input[(32 - 3)];
1058  step[3] = input[3] + input[(32 - 4)];
1059  step[4] = input[4] + input[(32 - 5)];
1060  step[5] = input[5] + input[(32 - 6)];
1061  step[6] = input[6] + input[(32 - 7)];
1062  step[7] = input[7] + input[(32 - 8)];
1063  step[8] = input[8] + input[(32 - 9)];
1064  step[9] = input[9] + input[(32 - 10)];
1065  step[10] = input[10] + input[(32 - 11)];
1066  step[11] = input[11] + input[(32 - 12)];
1067  step[12] = input[12] + input[(32 - 13)];
1068  step[13] = input[13] + input[(32 - 14)];
1069  step[14] = input[14] + input[(32 - 15)];
1070  step[15] = input[15] + input[(32 - 16)];
1071  step[16] = -input[16] + input[(32 - 17)];
1072  step[17] = -input[17] + input[(32 - 18)];
1073  step[18] = -input[18] + input[(32 - 19)];
1074  step[19] = -input[19] + input[(32 - 20)];
1075  step[20] = -input[20] + input[(32 - 21)];
1076  step[21] = -input[21] + input[(32 - 22)];
1077  step[22] = -input[22] + input[(32 - 23)];
1078  step[23] = -input[23] + input[(32 - 24)];
1079  step[24] = -input[24] + input[(32 - 25)];
1080  step[25] = -input[25] + input[(32 - 26)];
1081  step[26] = -input[26] + input[(32 - 27)];
1082  step[27] = -input[27] + input[(32 - 28)];
1083  step[28] = -input[28] + input[(32 - 29)];
1084  step[29] = -input[29] + input[(32 - 30)];
1085  step[30] = -input[30] + input[(32 - 31)];
1086  step[31] = -input[31] + input[(32 - 32)];
1087
1088  // Stage 2
1089  output[0] = step[0] + step[16 - 1];
1090  output[1] = step[1] + step[16 - 2];
1091  output[2] = step[2] + step[16 - 3];
1092  output[3] = step[3] + step[16 - 4];
1093  output[4] = step[4] + step[16 - 5];
1094  output[5] = step[5] + step[16 - 6];
1095  output[6] = step[6] + step[16 - 7];
1096  output[7] = step[7] + step[16 - 8];
1097  output[8] = -step[8] + step[16 - 9];
1098  output[9] = -step[9] + step[16 - 10];
1099  output[10] = -step[10] + step[16 - 11];
1100  output[11] = -step[11] + step[16 - 12];
1101  output[12] = -step[12] + step[16 - 13];
1102  output[13] = -step[13] + step[16 - 14];
1103  output[14] = -step[14] + step[16 - 15];
1104  output[15] = -step[15] + step[16 - 16];
1105
1106  output[16] = step[16];
1107  output[17] = step[17];
1108  output[18] = step[18];
1109  output[19] = step[19];
1110
1111  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1112  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1113  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1114  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1115
1116  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1117  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1118  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1119  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1120
1121  output[28] = step[28];
1122  output[29] = step[29];
1123  output[30] = step[30];
1124  output[31] = step[31];
1125
1126  // dump the magnitude by 4, hence the intermediate values are within
1127  // the range of 16 bits.
1128  if (round) {
1129    output[0] = half_round_shift(output[0]);
1130    output[1] = half_round_shift(output[1]);
1131    output[2] = half_round_shift(output[2]);
1132    output[3] = half_round_shift(output[3]);
1133    output[4] = half_round_shift(output[4]);
1134    output[5] = half_round_shift(output[5]);
1135    output[6] = half_round_shift(output[6]);
1136    output[7] = half_round_shift(output[7]);
1137    output[8] = half_round_shift(output[8]);
1138    output[9] = half_round_shift(output[9]);
1139    output[10] = half_round_shift(output[10]);
1140    output[11] = half_round_shift(output[11]);
1141    output[12] = half_round_shift(output[12]);
1142    output[13] = half_round_shift(output[13]);
1143    output[14] = half_round_shift(output[14]);
1144    output[15] = half_round_shift(output[15]);
1145
1146    output[16] = half_round_shift(output[16]);
1147    output[17] = half_round_shift(output[17]);
1148    output[18] = half_round_shift(output[18]);
1149    output[19] = half_round_shift(output[19]);
1150    output[20] = half_round_shift(output[20]);
1151    output[21] = half_round_shift(output[21]);
1152    output[22] = half_round_shift(output[22]);
1153    output[23] = half_round_shift(output[23]);
1154    output[24] = half_round_shift(output[24]);
1155    output[25] = half_round_shift(output[25]);
1156    output[26] = half_round_shift(output[26]);
1157    output[27] = half_round_shift(output[27]);
1158    output[28] = half_round_shift(output[28]);
1159    output[29] = half_round_shift(output[29]);
1160    output[30] = half_round_shift(output[30]);
1161    output[31] = half_round_shift(output[31]);
1162  }
1163
1164  // Stage 3
1165  step[0] = output[0] + output[(8 - 1)];
1166  step[1] = output[1] + output[(8 - 2)];
1167  step[2] = output[2] + output[(8 - 3)];
1168  step[3] = output[3] + output[(8 - 4)];
1169  step[4] = -output[4] + output[(8 - 5)];
1170  step[5] = -output[5] + output[(8 - 6)];
1171  step[6] = -output[6] + output[(8 - 7)];
1172  step[7] = -output[7] + output[(8 - 8)];
1173  step[8] = output[8];
1174  step[9] = output[9];
1175  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1176  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1177  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1178  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1179  step[14] = output[14];
1180  step[15] = output[15];
1181
1182  step[16] = output[16] + output[23];
1183  step[17] = output[17] + output[22];
1184  step[18] = output[18] + output[21];
1185  step[19] = output[19] + output[20];
1186  step[20] = -output[20] + output[19];
1187  step[21] = -output[21] + output[18];
1188  step[22] = -output[22] + output[17];
1189  step[23] = -output[23] + output[16];
1190  step[24] = -output[24] + output[31];
1191  step[25] = -output[25] + output[30];
1192  step[26] = -output[26] + output[29];
1193  step[27] = -output[27] + output[28];
1194  step[28] = output[28] + output[27];
1195  step[29] = output[29] + output[26];
1196  step[30] = output[30] + output[25];
1197  step[31] = output[31] + output[24];
1198
1199  // Stage 4
1200  output[0] = step[0] + step[3];
1201  output[1] = step[1] + step[2];
1202  output[2] = -step[2] + step[1];
1203  output[3] = -step[3] + step[0];
1204  output[4] = step[4];
1205  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1206  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1207  output[7] = step[7];
1208  output[8] = step[8] + step[11];
1209  output[9] = step[9] + step[10];
1210  output[10] = -step[10] + step[9];
1211  output[11] = -step[11] + step[8];
1212  output[12] = -step[12] + step[15];
1213  output[13] = -step[13] + step[14];
1214  output[14] = step[14] + step[13];
1215  output[15] = step[15] + step[12];
1216
1217  output[16] = step[16];
1218  output[17] = step[17];
1219  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1220  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1221  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1222  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1223  output[22] = step[22];
1224  output[23] = step[23];
1225  output[24] = step[24];
1226  output[25] = step[25];
1227  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1228  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1229  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1230  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1231  output[30] = step[30];
1232  output[31] = step[31];
1233
1234  // Stage 5
1235  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1236  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1237  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1238  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1239  step[4] = output[4] + output[5];
1240  step[5] = -output[5] + output[4];
1241  step[6] = -output[6] + output[7];
1242  step[7] = output[7] + output[6];
1243  step[8] = output[8];
1244  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1245  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1246  step[11] = output[11];
1247  step[12] = output[12];
1248  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1249  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1250  step[15] = output[15];
1251
1252  step[16] = output[16] + output[19];
1253  step[17] = output[17] + output[18];
1254  step[18] = -output[18] + output[17];
1255  step[19] = -output[19] + output[16];
1256  step[20] = -output[20] + output[23];
1257  step[21] = -output[21] + output[22];
1258  step[22] = output[22] + output[21];
1259  step[23] = output[23] + output[20];
1260  step[24] = output[24] + output[27];
1261  step[25] = output[25] + output[26];
1262  step[26] = -output[26] + output[25];
1263  step[27] = -output[27] + output[24];
1264  step[28] = -output[28] + output[31];
1265  step[29] = -output[29] + output[30];
1266  step[30] = output[30] + output[29];
1267  step[31] = output[31] + output[28];
1268
1269  // Stage 6
1270  output[0] = step[0];
1271  output[1] = step[1];
1272  output[2] = step[2];
1273  output[3] = step[3];
1274  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1275  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1276  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1277  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1278  output[8] = step[8] + step[9];
1279  output[9] = -step[9] + step[8];
1280  output[10] = -step[10] + step[11];
1281  output[11] = step[11] + step[10];
1282  output[12] = step[12] + step[13];
1283  output[13] = -step[13] + step[12];
1284  output[14] = -step[14] + step[15];
1285  output[15] = step[15] + step[14];
1286
1287  output[16] = step[16];
1288  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1289  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1290  output[19] = step[19];
1291  output[20] = step[20];
1292  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1293  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1294  output[23] = step[23];
1295  output[24] = step[24];
1296  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1297  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1298  output[27] = step[27];
1299  output[28] = step[28];
1300  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1301  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1302  output[31] = step[31];
1303
1304  // Stage 7
1305  step[0] = output[0];
1306  step[1] = output[1];
1307  step[2] = output[2];
1308  step[3] = output[3];
1309  step[4] = output[4];
1310  step[5] = output[5];
1311  step[6] = output[6];
1312  step[7] = output[7];
1313  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1314  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1315  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1316  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1317  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1318  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1319  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1320  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1321
1322  step[16] = output[16] + output[17];
1323  step[17] = -output[17] + output[16];
1324  step[18] = -output[18] + output[19];
1325  step[19] = output[19] + output[18];
1326  step[20] = output[20] + output[21];
1327  step[21] = -output[21] + output[20];
1328  step[22] = -output[22] + output[23];
1329  step[23] = output[23] + output[22];
1330  step[24] = output[24] + output[25];
1331  step[25] = -output[25] + output[24];
1332  step[26] = -output[26] + output[27];
1333  step[27] = output[27] + output[26];
1334  step[28] = output[28] + output[29];
1335  step[29] = -output[29] + output[28];
1336  step[30] = -output[30] + output[31];
1337  step[31] = output[31] + output[30];
1338
1339  // Final stage --- outputs indices are bit-reversed.
1340  output[0]  = step[0];
1341  output[16] = step[1];
1342  output[8]  = step[2];
1343  output[24] = step[3];
1344  output[4]  = step[4];
1345  output[20] = step[5];
1346  output[12] = step[6];
1347  output[28] = step[7];
1348  output[2]  = step[8];
1349  output[18] = step[9];
1350  output[10] = step[10];
1351  output[26] = step[11];
1352  output[6]  = step[12];
1353  output[22] = step[13];
1354  output[14] = step[14];
1355  output[30] = step[15];
1356
1357  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1358  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1359  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1360  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1361  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1362  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1363  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1364  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1365  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1366  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1367  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1368  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1369  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1370  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1371  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1372  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1373}
1374
1375void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
1376  int r, c;
1377  tran_low_t sum = 0;
1378  for (r = 0; r < 32; ++r)
1379    for (c = 0; c < 32; ++c)
1380      sum += input[r * stride + c];
1381
1382  output[0] = sum >> 3;
1383  output[1] = 0;
1384}
1385
1386void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1387  int i, j;
1388  tran_high_t output[32 * 32];
1389
1390  // Columns
1391  for (i = 0; i < 32; ++i) {
1392    tran_high_t temp_in[32], temp_out[32];
1393    for (j = 0; j < 32; ++j)
1394      temp_in[j] = input[j * stride + i] * 4;
1395    fdct32(temp_in, temp_out, 0);
1396    for (j = 0; j < 32; ++j)
1397      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1398  }
1399
1400  // Rows
1401  for (i = 0; i < 32; ++i) {
1402    tran_high_t temp_in[32], temp_out[32];
1403    for (j = 0; j < 32; ++j)
1404      temp_in[j] = output[j + i * 32];
1405    fdct32(temp_in, temp_out, 0);
1406    for (j = 0; j < 32; ++j)
1407      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1408  }
1409}
1410
1411// Note that although we use dct_32_round in dct32 computation flow,
1412// this 2d fdct32x32 for rate-distortion optimization loop is operating
1413// within 16 bits precision.
1414void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
1415  int i, j;
1416  tran_high_t output[32 * 32];
1417
1418  // Columns
1419  for (i = 0; i < 32; ++i) {
1420    tran_high_t temp_in[32], temp_out[32];
1421    for (j = 0; j < 32; ++j)
1422      temp_in[j] = input[j * stride + i] * 4;
1423    fdct32(temp_in, temp_out, 0);
1424    for (j = 0; j < 32; ++j)
1425      // TODO(cd): see quality impact of only doing
1426      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1427      //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1428      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1429  }
1430
1431  // Rows
1432  for (i = 0; i < 32; ++i) {
1433    tran_high_t temp_in[32], temp_out[32];
1434    for (j = 0; j < 32; ++j)
1435      temp_in[j] = output[j + i * 32];
1436    fdct32(temp_in, temp_out, 1);
1437    for (j = 0; j < 32; ++j)
1438      out[j + i * 32] = temp_out[j];
1439  }
1440}
1441
1442#if CONFIG_VP9_HIGHBITDEPTH
1443void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1444  vp9_fdct4x4_c(input, output, stride);
1445}
1446
1447void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
1448                       int stride, int tx_type) {
1449  vp9_fht4x4_c(input, output, stride, tx_type);
1450}
1451
1452void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
1453                          int stride) {
1454  vp9_fdct8x8_1_c(input, final_output, stride);
1455}
1456
1457void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
1458                        int stride) {
1459  vp9_fdct8x8_c(input, final_output, stride);
1460}
1461
1462void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
1463                            int stride) {
1464  vp9_fdct16x16_1_c(input, output, stride);
1465}
1466
1467void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
1468                          int stride) {
1469  vp9_fdct16x16_c(input, output, stride);
1470}
1471
1472void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
1473                  int stride, int tx_type) {
1474  vp9_fht8x8_c(input, output, stride, tx_type);
1475}
1476
1477void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1478  vp9_fwht4x4_c(input, output, stride);
1479}
1480
1481void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
1482                    int stride, int tx_type) {
1483  vp9_fht16x16_c(input, output, stride, tx_type);
1484}
1485
1486void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
1487  vp9_fdct32x32_1_c(input, out, stride);
1488}
1489
1490void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1491  vp9_fdct32x32_c(input, out, stride);
1492}
1493
1494void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
1495                             int stride) {
1496  vp9_fdct32x32_rd_c(input, out, stride);
1497}
1498#endif  // CONFIG_VP9_HIGHBITDEPTH
1499