1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include <assert.h>
13#include <math.h>
14#include "./vpx_config.h"
15#include "vp9/common/vp9_systemdependent.h"
16
17#include "vp9/common/vp9_blockd.h"
18#include "vp9/common/vp9_idct.h"
19
20static void fdct4_1d(int16_t *input, int16_t *output) {
21  int16_t step[4];
22  int temp1, temp2;
23
24  step[0] = input[0] + input[3];
25  step[1] = input[1] + input[2];
26  step[2] = input[1] - input[2];
27  step[3] = input[0] - input[3];
28
29  temp1 = (step[0] + step[1]) * cospi_16_64;
30  temp2 = (step[0] - step[1]) * cospi_16_64;
31  output[0] = dct_const_round_shift(temp1);
32  output[2] = dct_const_round_shift(temp2);
33  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
34  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
35  output[1] = dct_const_round_shift(temp1);
36  output[3] = dct_const_round_shift(temp2);
37}
38
39void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
40  // The 2D transform is done with two passes which are actually pretty
41  // similar. In the first one, we transform the columns and transpose
42  // the results. In the second one, we transform the rows. To achieve that,
43  // as the first pass results are transposed, we tranpose the columns (that
44  // is the transposed rows) and transpose the results (so that it goes back
45  // in normal/row positions).
46  const int stride = pitch >> 1;
47  int pass;
48  // We need an intermediate buffer between passes.
49  int16_t intermediate[4 * 4];
50  int16_t *in = input;
51  int16_t *out = intermediate;
52  // Do the two transform/transpose passes
53  for (pass = 0; pass < 2; ++pass) {
54    /*canbe16*/ int input[4];
55    /*canbe16*/ int step[4];
56    /*needs32*/ int temp1, temp2;
57    int i;
58    for (i = 0; i < 4; ++i) {
59      // Load inputs.
60      if (0 == pass) {
61        input[0] = in[0 * stride] << 4;
62        input[1] = in[1 * stride] << 4;
63        input[2] = in[2 * stride] << 4;
64        input[3] = in[3 * stride] << 4;
65        if (i == 0 && input[0]) {
66          input[0] += 1;
67        }
68      } else {
69        input[0] = in[0 * 4];
70        input[1] = in[1 * 4];
71        input[2] = in[2 * 4];
72        input[3] = in[3 * 4];
73      }
74      // Transform.
75      step[0] = input[0] + input[3];
76      step[1] = input[1] + input[2];
77      step[2] = input[1] - input[2];
78      step[3] = input[0] - input[3];
79      temp1 = (step[0] + step[1]) * cospi_16_64;
80      temp2 = (step[0] - step[1]) * cospi_16_64;
81      out[0] = dct_const_round_shift(temp1);
82      out[2] = dct_const_round_shift(temp2);
83      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
84      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
85      out[1] = dct_const_round_shift(temp1);
86      out[3] = dct_const_round_shift(temp2);
87      // Do next column (which is a transposed row in second/horizontal pass)
88      in++;
89      out += 4;
90    }
91    // Setup in/out for next pass.
92    in = intermediate;
93    out = output;
94  }
95
96  {
97    int i, j;
98    for (i = 0; i < 4; ++i) {
99      for (j = 0; j < 4; ++j)
100        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
101    }
102  }
103}
104
105static void fadst4_1d(int16_t *input, int16_t *output) {
106  int x0, x1, x2, x3;
107  int s0, s1, s2, s3, s4, s5, s6, s7;
108
109  x0 = input[0];
110  x1 = input[1];
111  x2 = input[2];
112  x3 = input[3];
113
114  if (!(x0 | x1 | x2 | x3)) {
115    output[0] = output[1] = output[2] = output[3] = 0;
116    return;
117  }
118
119  s0 = sinpi_1_9 * x0;
120  s1 = sinpi_4_9 * x0;
121  s2 = sinpi_2_9 * x1;
122  s3 = sinpi_1_9 * x1;
123  s4 = sinpi_3_9 * x2;
124  s5 = sinpi_4_9 * x3;
125  s6 = sinpi_2_9 * x3;
126  s7 = x0 + x1 - x3;
127
128  x0 = s0 + s2 + s5;
129  x1 = sinpi_3_9 * s7;
130  x2 = s1 - s3 + s6;
131  x3 = s4;
132
133  s0 = x0 + x3;
134  s1 = x1;
135  s2 = x2 - x3;
136  s3 = x2 - x0 + x3;
137
138  // 1-D transform scaling factor is sqrt(2).
139  output[0] = dct_const_round_shift(s0);
140  output[1] = dct_const_round_shift(s1);
141  output[2] = dct_const_round_shift(s2);
142  output[3] = dct_const_round_shift(s3);
143}
144
145static const transform_2d FHT_4[] = {
146  { fdct4_1d,  fdct4_1d  },  // DCT_DCT  = 0
147  { fadst4_1d, fdct4_1d  },  // ADST_DCT = 1
148  { fdct4_1d,  fadst4_1d },  // DCT_ADST = 2
149  { fadst4_1d, fadst4_1d }   // ADST_ADST = 3
150};
151
152void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
153                        int pitch, TX_TYPE tx_type) {
154  int16_t out[4 * 4];
155  int16_t *outptr = &out[0];
156  int i, j;
157  int16_t temp_in[4], temp_out[4];
158  const transform_2d ht = FHT_4[tx_type];
159
160  // Columns
161  for (i = 0; i < 4; ++i) {
162    for (j = 0; j < 4; ++j)
163      temp_in[j] = input[j * pitch + i] << 4;
164    if (i == 0 && temp_in[0])
165      temp_in[0] += 1;
166    ht.cols(temp_in, temp_out);
167    for (j = 0; j < 4; ++j)
168      outptr[j * 4 + i] = temp_out[j];
169  }
170
171  // Rows
172  for (i = 0; i < 4; ++i) {
173    for (j = 0; j < 4; ++j)
174      temp_in[j] = out[j + i * 4];
175    ht.rows(temp_in, temp_out);
176    for (j = 0; j < 4; ++j)
177      output[j + i * 4] = (temp_out[j] + 1) >> 2;
178  }
179}
180
181void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) {
182    vp9_short_fdct4x4_c(input, output, pitch);
183    vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
184}
185
186static void fdct8_1d(int16_t *input, int16_t *output) {
187  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
188  /*needs32*/ int t0, t1, t2, t3;
189  /*canbe16*/ int x0, x1, x2, x3;
190
191  // stage 1
192  s0 = input[0] + input[7];
193  s1 = input[1] + input[6];
194  s2 = input[2] + input[5];
195  s3 = input[3] + input[4];
196  s4 = input[3] - input[4];
197  s5 = input[2] - input[5];
198  s6 = input[1] - input[6];
199  s7 = input[0] - input[7];
200
201  // fdct4_1d(step, step);
202  x0 = s0 + s3;
203  x1 = s1 + s2;
204  x2 = s1 - s2;
205  x3 = s0 - s3;
206  t0 = (x0 + x1) * cospi_16_64;
207  t1 = (x0 - x1) * cospi_16_64;
208  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
209  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
210  output[0] = dct_const_round_shift(t0);
211  output[2] = dct_const_round_shift(t2);
212  output[4] = dct_const_round_shift(t1);
213  output[6] = dct_const_round_shift(t3);
214
215  // Stage 2
216  t0 = (s6 - s5) * cospi_16_64;
217  t1 = (s6 + s5) * cospi_16_64;
218  t2 = dct_const_round_shift(t0);
219  t3 = dct_const_round_shift(t1);
220
221  // Stage 3
222  x0 = s4 + t2;
223  x1 = s4 - t2;
224  x2 = s7 - t3;
225  x3 = s7 + t3;
226
227  // Stage 4
228  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
229  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
230  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
231  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
232  output[1] = dct_const_round_shift(t0);
233  output[3] = dct_const_round_shift(t2);
234  output[5] = dct_const_round_shift(t1);
235  output[7] = dct_const_round_shift(t3);
236}
237
238void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
239  const int stride = pitch >> 1;
240  int i, j;
241  int16_t intermediate[64];
242
243  // Transform columns
244  {
245    int16_t *output = intermediate;
246    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
247    /*needs32*/ int t0, t1, t2, t3;
248    /*canbe16*/ int x0, x1, x2, x3;
249
250    int i;
251    for (i = 0; i < 8; i++) {
252      // stage 1
253      s0 = (input[0 * stride] + input[7 * stride]) << 2;
254      s1 = (input[1 * stride] + input[6 * stride]) << 2;
255      s2 = (input[2 * stride] + input[5 * stride]) << 2;
256      s3 = (input[3 * stride] + input[4 * stride]) << 2;
257      s4 = (input[3 * stride] - input[4 * stride]) << 2;
258      s5 = (input[2 * stride] - input[5 * stride]) << 2;
259      s6 = (input[1 * stride] - input[6 * stride]) << 2;
260      s7 = (input[0 * stride] - input[7 * stride]) << 2;
261
262      // fdct4_1d(step, step);
263      x0 = s0 + s3;
264      x1 = s1 + s2;
265      x2 = s1 - s2;
266      x3 = s0 - s3;
267      t0 = (x0 + x1) * cospi_16_64;
268      t1 = (x0 - x1) * cospi_16_64;
269      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
270      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
271      output[0 * 8] = dct_const_round_shift(t0);
272      output[2 * 8] = dct_const_round_shift(t2);
273      output[4 * 8] = dct_const_round_shift(t1);
274      output[6 * 8] = dct_const_round_shift(t3);
275
276      // Stage 2
277      t0 = (s6 - s5) * cospi_16_64;
278      t1 = (s6 + s5) * cospi_16_64;
279      t2 = dct_const_round_shift(t0);
280      t3 = dct_const_round_shift(t1);
281
282      // Stage 3
283      x0 = s4 + t2;
284      x1 = s4 - t2;
285      x2 = s7 - t3;
286      x3 = s7 + t3;
287
288      // Stage 4
289      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
290      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
291      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
292      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
293      output[1 * 8] = dct_const_round_shift(t0);
294      output[3 * 8] = dct_const_round_shift(t2);
295      output[5 * 8] = dct_const_round_shift(t1);
296      output[7 * 8] = dct_const_round_shift(t3);
297      input++;
298      output++;
299    }
300  }
301
302  // Rows
303  for (i = 0; i < 8; ++i) {
304    fdct8_1d(&intermediate[i * 8], &final_output[i * 8]);
305    for (j = 0; j < 8; ++j)
306      final_output[j + i * 8] /= 2;
307  }
308}
309
310void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
311  // The 2D transform is done with two passes which are actually pretty
312  // similar. In the first one, we transform the columns and transpose
313  // the results. In the second one, we transform the rows. To achieve that,
314  // as the first pass results are transposed, we tranpose the columns (that
315  // is the transposed rows) and transpose the results (so that it goes back
316  // in normal/row positions).
317  const int stride = pitch >> 1;
318  int pass;
319  // We need an intermediate buffer between passes.
320  int16_t intermediate[256];
321  int16_t *in = input;
322  int16_t *out = intermediate;
323  // Do the two transform/transpose passes
324  for (pass = 0; pass < 2; ++pass) {
325    /*canbe16*/ int step1[8];
326    /*canbe16*/ int step2[8];
327    /*canbe16*/ int step3[8];
328    /*canbe16*/ int input[8];
329    /*needs32*/ int temp1, temp2;
330    int i;
331    for (i = 0; i < 16; i++) {
332      if (0 == pass) {
333        // Calculate input for the first 8 results.
334        input[0] = (in[0 * stride] + in[15 * stride]) << 2;
335        input[1] = (in[1 * stride] + in[14 * stride]) << 2;
336        input[2] = (in[2 * stride] + in[13 * stride]) << 2;
337        input[3] = (in[3 * stride] + in[12 * stride]) << 2;
338        input[4] = (in[4 * stride] + in[11 * stride]) << 2;
339        input[5] = (in[5 * stride] + in[10 * stride]) << 2;
340        input[6] = (in[6 * stride] + in[ 9 * stride]) << 2;
341        input[7] = (in[7 * stride] + in[ 8 * stride]) << 2;
342        // Calculate input for the next 8 results.
343        step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2;
344        step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2;
345        step1[2] = (in[5 * stride] - in[10 * stride]) << 2;
346        step1[3] = (in[4 * stride] - in[11 * stride]) << 2;
347        step1[4] = (in[3 * stride] - in[12 * stride]) << 2;
348        step1[5] = (in[2 * stride] - in[13 * stride]) << 2;
349        step1[6] = (in[1 * stride] - in[14 * stride]) << 2;
350        step1[7] = (in[0 * stride] - in[15 * stride]) << 2;
351      } else {
352        // Calculate input for the first 8 results.
353        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
354        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
355        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
356        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
357        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
358        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
359        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
360        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
361        // Calculate input for the next 8 results.
362        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
363        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
364        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
365        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
366        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
367        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
368        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
369        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
370      }
371      // Work on the first eight values; fdct8_1d(input, even_results);
372      {
373        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
374        /*needs32*/ int t0, t1, t2, t3;
375        /*canbe16*/ int x0, x1, x2, x3;
376
377        // stage 1
378        s0 = input[0] + input[7];
379        s1 = input[1] + input[6];
380        s2 = input[2] + input[5];
381        s3 = input[3] + input[4];
382        s4 = input[3] - input[4];
383        s5 = input[2] - input[5];
384        s6 = input[1] - input[6];
385        s7 = input[0] - input[7];
386
387        // fdct4_1d(step, step);
388        x0 = s0 + s3;
389        x1 = s1 + s2;
390        x2 = s1 - s2;
391        x3 = s0 - s3;
392        t0 = (x0 + x1) * cospi_16_64;
393        t1 = (x0 - x1) * cospi_16_64;
394        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
395        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
396        out[0] = dct_const_round_shift(t0);
397        out[4] = dct_const_round_shift(t2);
398        out[8] = dct_const_round_shift(t1);
399        out[12] = dct_const_round_shift(t3);
400
401        // Stage 2
402        t0 = (s6 - s5) * cospi_16_64;
403        t1 = (s6 + s5) * cospi_16_64;
404        t2 = dct_const_round_shift(t0);
405        t3 = dct_const_round_shift(t1);
406
407        // Stage 3
408        x0 = s4 + t2;
409        x1 = s4 - t2;
410        x2 = s7 - t3;
411        x3 = s7 + t3;
412
413        // Stage 4
414        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
415        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
416        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
417        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
418        out[2] = dct_const_round_shift(t0);
419        out[6] = dct_const_round_shift(t2);
420        out[10] = dct_const_round_shift(t1);
421        out[14] = dct_const_round_shift(t3);
422      }
423      // Work on the next eight values; step1 -> odd_results
424      {
425        // step 2
426        temp1 = (step1[5] - step1[2]) * cospi_16_64;
427        temp2 = (step1[4] - step1[3]) * cospi_16_64;
428        step2[2] = dct_const_round_shift(temp1);
429        step2[3] = dct_const_round_shift(temp2);
430        temp1 = (step1[4] + step1[3]) * cospi_16_64;
431        temp2 = (step1[5] + step1[2]) * cospi_16_64;
432        step2[4] = dct_const_round_shift(temp1);
433        step2[5] = dct_const_round_shift(temp2);
434        // step 3
435        step3[0] = step1[0] + step2[3];
436        step3[1] = step1[1] + step2[2];
437        step3[2] = step1[1] - step2[2];
438        step3[3] = step1[0] - step2[3];
439        step3[4] = step1[7] - step2[4];
440        step3[5] = step1[6] - step2[5];
441        step3[6] = step1[6] + step2[5];
442        step3[7] = step1[7] + step2[4];
443        // step 4
444        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
445        temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
446        step2[1] = dct_const_round_shift(temp1);
447        step2[2] = dct_const_round_shift(temp2);
448        temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
449        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
450        step2[5] = dct_const_round_shift(temp1);
451        step2[6] = dct_const_round_shift(temp2);
452        // step 5
453        step1[0] = step3[0] + step2[1];
454        step1[1] = step3[0] - step2[1];
455        step1[2] = step3[3] - step2[2];
456        step1[3] = step3[3] + step2[2];
457        step1[4] = step3[4] + step2[5];
458        step1[5] = step3[4] - step2[5];
459        step1[6] = step3[7] - step2[6];
460        step1[7] = step3[7] + step2[6];
461        // step 6
462        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
463        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
464        out[1] = dct_const_round_shift(temp1);
465        out[9] = dct_const_round_shift(temp2);
466        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
467        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
468        out[5] = dct_const_round_shift(temp1);
469        out[13] = dct_const_round_shift(temp2);
470        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
471        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
472        out[3] = dct_const_round_shift(temp1);
473        out[11] = dct_const_round_shift(temp2);
474        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
475        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
476        out[7] = dct_const_round_shift(temp1);
477        out[15] = dct_const_round_shift(temp2);
478      }
479      // Do next column (which is a transposed row in second/horizontal pass)
480      in++;
481      out += 16;
482    }
483    // Setup in/out for next pass.
484    in = intermediate;
485    out = output;
486  }
487}
488
489static void fadst8_1d(int16_t *input, int16_t *output) {
490  int s0, s1, s2, s3, s4, s5, s6, s7;
491
492  int x0 = input[7];
493  int x1 = input[0];
494  int x2 = input[5];
495  int x3 = input[2];
496  int x4 = input[3];
497  int x5 = input[4];
498  int x6 = input[1];
499  int x7 = input[6];
500
501  // stage 1
502  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
503  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
504  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
505  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
506  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
507  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
508  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
509  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
510
511  x0 = dct_const_round_shift(s0 + s4);
512  x1 = dct_const_round_shift(s1 + s5);
513  x2 = dct_const_round_shift(s2 + s6);
514  x3 = dct_const_round_shift(s3 + s7);
515  x4 = dct_const_round_shift(s0 - s4);
516  x5 = dct_const_round_shift(s1 - s5);
517  x6 = dct_const_round_shift(s2 - s6);
518  x7 = dct_const_round_shift(s3 - s7);
519
520  // stage 2
521  s0 = x0;
522  s1 = x1;
523  s2 = x2;
524  s3 = x3;
525  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
526  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
527  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
528  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
529
530  x0 = s0 + s2;
531  x1 = s1 + s3;
532  x2 = s0 - s2;
533  x3 = s1 - s3;
534  x4 = dct_const_round_shift(s4 + s6);
535  x5 = dct_const_round_shift(s5 + s7);
536  x6 = dct_const_round_shift(s4 - s6);
537  x7 = dct_const_round_shift(s5 - s7);
538
539  // stage 3
540  s2 = cospi_16_64 * (x2 + x3);
541  s3 = cospi_16_64 * (x2 - x3);
542  s6 = cospi_16_64 * (x6 + x7);
543  s7 = cospi_16_64 * (x6 - x7);
544
545  x2 = dct_const_round_shift(s2);
546  x3 = dct_const_round_shift(s3);
547  x6 = dct_const_round_shift(s6);
548  x7 = dct_const_round_shift(s7);
549
550  output[0] =   x0;
551  output[1] = - x4;
552  output[2] =   x6;
553  output[3] = - x2;
554  output[4] =   x3;
555  output[5] = - x7;
556  output[6] =   x5;
557  output[7] = - x1;
558}
559
560static const transform_2d FHT_8[] = {
561  { fdct8_1d,  fdct8_1d  },  // DCT_DCT  = 0
562  { fadst8_1d, fdct8_1d  },  // ADST_DCT = 1
563  { fdct8_1d,  fadst8_1d },  // DCT_ADST = 2
564  { fadst8_1d, fadst8_1d }   // ADST_ADST = 3
565};
566
567void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
568                        int pitch, TX_TYPE tx_type) {
569  int16_t out[64];
570  int16_t *outptr = &out[0];
571  int i, j;
572  int16_t temp_in[8], temp_out[8];
573  const transform_2d ht = FHT_8[tx_type];
574
575  // Columns
576  for (i = 0; i < 8; ++i) {
577    for (j = 0; j < 8; ++j)
578      temp_in[j] = input[j * pitch + i] << 2;
579    ht.cols(temp_in, temp_out);
580    for (j = 0; j < 8; ++j)
581      outptr[j * 8 + i] = temp_out[j];
582  }
583
584  // Rows
585  for (i = 0; i < 8; ++i) {
586    for (j = 0; j < 8; ++j)
587      temp_in[j] = out[j + i * 8];
588    ht.rows(temp_in, temp_out);
589    for (j = 0; j < 8; ++j)
590      output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
591  }
592}
593
594/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
595   pixel. */
596void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
597  int i;
598  int a1, b1, c1, d1, e1;
599  short *ip = input;
600  short *op = output;
601  int pitch_short = pitch >> 1;
602
603  for (i = 0; i < 4; i++) {
604    a1 = ip[0 * pitch_short];
605    b1 = ip[1 * pitch_short];
606    c1 = ip[2 * pitch_short];
607    d1 = ip[3 * pitch_short];
608
609    a1 += b1;
610    d1 = d1 - c1;
611    e1 = (a1 - d1) >> 1;
612    b1 = e1 - b1;
613    c1 = e1 - c1;
614    a1 -= c1;
615    d1 += b1;
616    op[0] = a1;
617    op[4] = c1;
618    op[8] = d1;
619    op[12] = b1;
620
621    ip++;
622    op++;
623  }
624  ip = output;
625  op = output;
626
627  for (i = 0; i < 4; i++) {
628    a1 = ip[0];
629    b1 = ip[1];
630    c1 = ip[2];
631    d1 = ip[3];
632
633    a1 += b1;
634    d1 -= c1;
635    e1 = (a1 - d1) >> 1;
636    b1 = e1 - b1;
637    c1 = e1 - c1;
638    a1 -= c1;
639    d1 += b1;
640    op[0] = a1 << WHT_UPSCALE_FACTOR;
641    op[1] = c1 << WHT_UPSCALE_FACTOR;
642    op[2] = d1 << WHT_UPSCALE_FACTOR;
643    op[3] = b1 << WHT_UPSCALE_FACTOR;
644
645    ip += 4;
646    op += 4;
647  }
648}
649
650void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
651  vp9_short_walsh4x4_c(input,   output,    pitch);
652  vp9_short_walsh4x4_c(input + 4, output + 16, pitch);
653}
654
655
656// Rewrote to use same algorithm as others.
657static void fdct16_1d(int16_t in[16], int16_t out[16]) {
658  /*canbe16*/ int step1[8];
659  /*canbe16*/ int step2[8];
660  /*canbe16*/ int step3[8];
661  /*canbe16*/ int input[8];
662  /*needs32*/ int temp1, temp2;
663
664  // step 1
665  input[0] = in[0] + in[15];
666  input[1] = in[1] + in[14];
667  input[2] = in[2] + in[13];
668  input[3] = in[3] + in[12];
669  input[4] = in[4] + in[11];
670  input[5] = in[5] + in[10];
671  input[6] = in[6] + in[ 9];
672  input[7] = in[7] + in[ 8];
673
674  step1[0] = in[7] - in[ 8];
675  step1[1] = in[6] - in[ 9];
676  step1[2] = in[5] - in[10];
677  step1[3] = in[4] - in[11];
678  step1[4] = in[3] - in[12];
679  step1[5] = in[2] - in[13];
680  step1[6] = in[1] - in[14];
681  step1[7] = in[0] - in[15];
682
683  // fdct8_1d(step, step);
684  {
685    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
686    /*needs32*/ int t0, t1, t2, t3;
687    /*canbe16*/ int x0, x1, x2, x3;
688
689    // stage 1
690    s0 = input[0] + input[7];
691    s1 = input[1] + input[6];
692    s2 = input[2] + input[5];
693    s3 = input[3] + input[4];
694    s4 = input[3] - input[4];
695    s5 = input[2] - input[5];
696    s6 = input[1] - input[6];
697    s7 = input[0] - input[7];
698
699    // fdct4_1d(step, step);
700    x0 = s0 + s3;
701    x1 = s1 + s2;
702    x2 = s1 - s2;
703    x3 = s0 - s3;
704    t0 = (x0 + x1) * cospi_16_64;
705    t1 = (x0 - x1) * cospi_16_64;
706    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
707    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
708    out[0] = dct_const_round_shift(t0);
709    out[4] = dct_const_round_shift(t2);
710    out[8] = dct_const_round_shift(t1);
711    out[12] = dct_const_round_shift(t3);
712
713    // Stage 2
714    t0 = (s6 - s5) * cospi_16_64;
715    t1 = (s6 + s5) * cospi_16_64;
716    t2 = dct_const_round_shift(t0);
717    t3 = dct_const_round_shift(t1);
718
719    // Stage 3
720    x0 = s4 + t2;
721    x1 = s4 - t2;
722    x2 = s7 - t3;
723    x3 = s7 + t3;
724
725    // Stage 4
726    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
727    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
728    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
729    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
730    out[2] = dct_const_round_shift(t0);
731    out[6] = dct_const_round_shift(t2);
732    out[10] = dct_const_round_shift(t1);
733    out[14] = dct_const_round_shift(t3);
734  }
735
736  // step 2
737  temp1 = (step1[5] - step1[2]) * cospi_16_64;
738  temp2 = (step1[4] - step1[3]) * cospi_16_64;
739  step2[2] = dct_const_round_shift(temp1);
740  step2[3] = dct_const_round_shift(temp2);
741  temp1 = (step1[4] + step1[3]) * cospi_16_64;
742  temp2 = (step1[5] + step1[2]) * cospi_16_64;
743  step2[4] = dct_const_round_shift(temp1);
744  step2[5] = dct_const_round_shift(temp2);
745
746  // step 3
747  step3[0] = step1[0] + step2[3];
748  step3[1] = step1[1] + step2[2];
749  step3[2] = step1[1] - step2[2];
750  step3[3] = step1[0] - step2[3];
751  step3[4] = step1[7] - step2[4];
752  step3[5] = step1[6] - step2[5];
753  step3[6] = step1[6] + step2[5];
754  step3[7] = step1[7] + step2[4];
755
756  // step 4
757  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
758  temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
759  step2[1] = dct_const_round_shift(temp1);
760  step2[2] = dct_const_round_shift(temp2);
761  temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
762  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
763  step2[5] = dct_const_round_shift(temp1);
764  step2[6] = dct_const_round_shift(temp2);
765
766  // step 5
767  step1[0] = step3[0] + step2[1];
768  step1[1] = step3[0] - step2[1];
769  step1[2] = step3[3] - step2[2];
770  step1[3] = step3[3] + step2[2];
771  step1[4] = step3[4] + step2[5];
772  step1[5] = step3[4] - step2[5];
773  step1[6] = step3[7] - step2[6];
774  step1[7] = step3[7] + step2[6];
775
776  // step 6
777  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
778  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
779  out[1] = dct_const_round_shift(temp1);
780  out[9] = dct_const_round_shift(temp2);
781
782  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
783  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
784  out[5] = dct_const_round_shift(temp1);
785  out[13] = dct_const_round_shift(temp2);
786
787  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
788  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
789  out[3] = dct_const_round_shift(temp1);
790  out[11] = dct_const_round_shift(temp2);
791
792  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
793  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
794  out[7] = dct_const_round_shift(temp1);
795  out[15] = dct_const_round_shift(temp2);
796}
797
798void fadst16_1d(int16_t *input, int16_t *output) {
799  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
800
801  int x0 = input[15];
802  int x1 = input[0];
803  int x2 = input[13];
804  int x3 = input[2];
805  int x4 = input[11];
806  int x5 = input[4];
807  int x6 = input[9];
808  int x7 = input[6];
809  int x8 = input[7];
810  int x9 = input[8];
811  int x10 = input[5];
812  int x11 = input[10];
813  int x12 = input[3];
814  int x13 = input[12];
815  int x14 = input[1];
816  int x15 = input[14];
817
818  // stage 1
819  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
820  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
821  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
822  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
823  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
824  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
825  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
826  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
827  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
828  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
829  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
830  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
831  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
832  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
833  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
834  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
835
836  x0 = dct_const_round_shift(s0 + s8);
837  x1 = dct_const_round_shift(s1 + s9);
838  x2 = dct_const_round_shift(s2 + s10);
839  x3 = dct_const_round_shift(s3 + s11);
840  x4 = dct_const_round_shift(s4 + s12);
841  x5 = dct_const_round_shift(s5 + s13);
842  x6 = dct_const_round_shift(s6 + s14);
843  x7 = dct_const_round_shift(s7 + s15);
844  x8  = dct_const_round_shift(s0 - s8);
845  x9  = dct_const_round_shift(s1 - s9);
846  x10 = dct_const_round_shift(s2 - s10);
847  x11 = dct_const_round_shift(s3 - s11);
848  x12 = dct_const_round_shift(s4 - s12);
849  x13 = dct_const_round_shift(s5 - s13);
850  x14 = dct_const_round_shift(s6 - s14);
851  x15 = dct_const_round_shift(s7 - s15);
852
853  // stage 2
854  s0 = x0;
855  s1 = x1;
856  s2 = x2;
857  s3 = x3;
858  s4 = x4;
859  s5 = x5;
860  s6 = x6;
861  s7 = x7;
862  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
863  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
864  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
865  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
866  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
867  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
868  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
869  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
870
871  x0 = s0 + s4;
872  x1 = s1 + s5;
873  x2 = s2 + s6;
874  x3 = s3 + s7;
875  x4 = s0 - s4;
876  x5 = s1 - s5;
877  x6 = s2 - s6;
878  x7 = s3 - s7;
879  x8 = dct_const_round_shift(s8 + s12);
880  x9 = dct_const_round_shift(s9 + s13);
881  x10 = dct_const_round_shift(s10 + s14);
882  x11 = dct_const_round_shift(s11 + s15);
883  x12 = dct_const_round_shift(s8 - s12);
884  x13 = dct_const_round_shift(s9 - s13);
885  x14 = dct_const_round_shift(s10 - s14);
886  x15 = dct_const_round_shift(s11 - s15);
887
888  // stage 3
889  s0 = x0;
890  s1 = x1;
891  s2 = x2;
892  s3 = x3;
893  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
894  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
895  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
896  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
897  s8 = x8;
898  s9 = x9;
899  s10 = x10;
900  s11 = x11;
901  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
902  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
903  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
904  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
905
906  x0 = s0 + s2;
907  x1 = s1 + s3;
908  x2 = s0 - s2;
909  x3 = s1 - s3;
910  x4 = dct_const_round_shift(s4 + s6);
911  x5 = dct_const_round_shift(s5 + s7);
912  x6 = dct_const_round_shift(s4 - s6);
913  x7 = dct_const_round_shift(s5 - s7);
914  x8 = s8 + s10;
915  x9 = s9 + s11;
916  x10 = s8 - s10;
917  x11 = s9 - s11;
918  x12 = dct_const_round_shift(s12 + s14);
919  x13 = dct_const_round_shift(s13 + s15);
920  x14 = dct_const_round_shift(s12 - s14);
921  x15 = dct_const_round_shift(s13 - s15);
922
923  // stage 4
924  s2 = (- cospi_16_64) * (x2 + x3);
925  s3 = cospi_16_64 * (x2 - x3);
926  s6 = cospi_16_64 * (x6 + x7);
927  s7 = cospi_16_64 * (- x6 + x7);
928  s10 = cospi_16_64 * (x10 + x11);
929  s11 = cospi_16_64 * (- x10 + x11);
930  s14 = (- cospi_16_64) * (x14 + x15);
931  s15 = cospi_16_64 * (x14 - x15);
932
933  x2 = dct_const_round_shift(s2);
934  x3 = dct_const_round_shift(s3);
935  x6 = dct_const_round_shift(s6);
936  x7 = dct_const_round_shift(s7);
937  x10 = dct_const_round_shift(s10);
938  x11 = dct_const_round_shift(s11);
939  x14 = dct_const_round_shift(s14);
940  x15 = dct_const_round_shift(s15);
941
942  output[0] = x0;
943  output[1] = - x8;
944  output[2] = x12;
945  output[3] = - x4;
946  output[4] = x6;
947  output[5] = x14;
948  output[6] = x10;
949  output[7] = x2;
950  output[8] = x3;
951  output[9] =  x11;
952  output[10] = x15;
953  output[11] = x7;
954  output[12] = x5;
955  output[13] = - x13;
956  output[14] = x9;
957  output[15] = - x1;
958}
959
960static const transform_2d FHT_16[] = {
961  { fdct16_1d,  fdct16_1d  },  // DCT_DCT  = 0
962  { fadst16_1d, fdct16_1d  },  // ADST_DCT = 1
963  { fdct16_1d,  fadst16_1d },  // DCT_ADST = 2
964  { fadst16_1d, fadst16_1d }   // ADST_ADST = 3
965};
966
967void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
968                          int pitch, TX_TYPE tx_type) {
969  int16_t out[256];
970  int16_t *outptr = &out[0];
971  int i, j;
972  int16_t temp_in[16], temp_out[16];
973  const transform_2d ht = FHT_16[tx_type];
974
975  // Columns
976  for (i = 0; i < 16; ++i) {
977    for (j = 0; j < 16; ++j)
978      temp_in[j] = input[j * pitch + i] << 2;
979    ht.cols(temp_in, temp_out);
980    for (j = 0; j < 16; ++j)
981      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
982//      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
983  }
984
985  // Rows
986  for (i = 0; i < 16; ++i) {
987    for (j = 0; j < 16; ++j)
988      temp_in[j] = out[j + i * 16];
989    ht.rows(temp_in, temp_out);
990    for (j = 0; j < 16; ++j)
991      output[j + i * 16] = temp_out[j];
992  }
993}
994
995static INLINE int dct_32_round(int input) {
996  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
997  assert(-131072 <= rv && rv <= 131071);
998  return rv;
999}
1000
1001static INLINE int half_round_shift(int input) {
1002  int rv = (input + 1 + (input < 0)) >> 2;
1003  return rv;
1004}
1005
1006static void dct32_1d(int *input, int *output, int round) {
1007  int step[32];
1008  // Stage 1
1009  step[0] = input[0] + input[(32 - 1)];
1010  step[1] = input[1] + input[(32 - 2)];
1011  step[2] = input[2] + input[(32 - 3)];
1012  step[3] = input[3] + input[(32 - 4)];
1013  step[4] = input[4] + input[(32 - 5)];
1014  step[5] = input[5] + input[(32 - 6)];
1015  step[6] = input[6] + input[(32 - 7)];
1016  step[7] = input[7] + input[(32 - 8)];
1017  step[8] = input[8] + input[(32 - 9)];
1018  step[9] = input[9] + input[(32 - 10)];
1019  step[10] = input[10] + input[(32 - 11)];
1020  step[11] = input[11] + input[(32 - 12)];
1021  step[12] = input[12] + input[(32 - 13)];
1022  step[13] = input[13] + input[(32 - 14)];
1023  step[14] = input[14] + input[(32 - 15)];
1024  step[15] = input[15] + input[(32 - 16)];
1025  step[16] = -input[16] + input[(32 - 17)];
1026  step[17] = -input[17] + input[(32 - 18)];
1027  step[18] = -input[18] + input[(32 - 19)];
1028  step[19] = -input[19] + input[(32 - 20)];
1029  step[20] = -input[20] + input[(32 - 21)];
1030  step[21] = -input[21] + input[(32 - 22)];
1031  step[22] = -input[22] + input[(32 - 23)];
1032  step[23] = -input[23] + input[(32 - 24)];
1033  step[24] = -input[24] + input[(32 - 25)];
1034  step[25] = -input[25] + input[(32 - 26)];
1035  step[26] = -input[26] + input[(32 - 27)];
1036  step[27] = -input[27] + input[(32 - 28)];
1037  step[28] = -input[28] + input[(32 - 29)];
1038  step[29] = -input[29] + input[(32 - 30)];
1039  step[30] = -input[30] + input[(32 - 31)];
1040  step[31] = -input[31] + input[(32 - 32)];
1041
1042  // Stage 2
1043  output[0] = step[0] + step[16 - 1];
1044  output[1] = step[1] + step[16 - 2];
1045  output[2] = step[2] + step[16 - 3];
1046  output[3] = step[3] + step[16 - 4];
1047  output[4] = step[4] + step[16 - 5];
1048  output[5] = step[5] + step[16 - 6];
1049  output[6] = step[6] + step[16 - 7];
1050  output[7] = step[7] + step[16 - 8];
1051  output[8] = -step[8] + step[16 - 9];
1052  output[9] = -step[9] + step[16 - 10];
1053  output[10] = -step[10] + step[16 - 11];
1054  output[11] = -step[11] + step[16 - 12];
1055  output[12] = -step[12] + step[16 - 13];
1056  output[13] = -step[13] + step[16 - 14];
1057  output[14] = -step[14] + step[16 - 15];
1058  output[15] = -step[15] + step[16 - 16];
1059
1060  output[16] = step[16];
1061  output[17] = step[17];
1062  output[18] = step[18];
1063  output[19] = step[19];
1064
1065  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1066  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1067  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1068  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1069
1070  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1071  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1072  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1073  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1074
1075  output[28] = step[28];
1076  output[29] = step[29];
1077  output[30] = step[30];
1078  output[31] = step[31];
1079
1080  // dump the magnitude by 4, hence the intermediate values are within
1081  // the range of 16 bits.
1082  if (round) {
1083    output[0] = half_round_shift(output[0]);
1084    output[1] = half_round_shift(output[1]);
1085    output[2] = half_round_shift(output[2]);
1086    output[3] = half_round_shift(output[3]);
1087    output[4] = half_round_shift(output[4]);
1088    output[5] = half_round_shift(output[5]);
1089    output[6] = half_round_shift(output[6]);
1090    output[7] = half_round_shift(output[7]);
1091    output[8] = half_round_shift(output[8]);
1092    output[9] = half_round_shift(output[9]);
1093    output[10] = half_round_shift(output[10]);
1094    output[11] = half_round_shift(output[11]);
1095    output[12] = half_round_shift(output[12]);
1096    output[13] = half_round_shift(output[13]);
1097    output[14] = half_round_shift(output[14]);
1098    output[15] = half_round_shift(output[15]);
1099
1100    output[16] = half_round_shift(output[16]);
1101    output[17] = half_round_shift(output[17]);
1102    output[18] = half_round_shift(output[18]);
1103    output[19] = half_round_shift(output[19]);
1104    output[20] = half_round_shift(output[20]);
1105    output[21] = half_round_shift(output[21]);
1106    output[22] = half_round_shift(output[22]);
1107    output[23] = half_round_shift(output[23]);
1108    output[24] = half_round_shift(output[24]);
1109    output[25] = half_round_shift(output[25]);
1110    output[26] = half_round_shift(output[26]);
1111    output[27] = half_round_shift(output[27]);
1112    output[28] = half_round_shift(output[28]);
1113    output[29] = half_round_shift(output[29]);
1114    output[30] = half_round_shift(output[30]);
1115    output[31] = half_round_shift(output[31]);
1116  }
1117
1118  // Stage 3
1119  step[0] = output[0] + output[(8 - 1)];
1120  step[1] = output[1] + output[(8 - 2)];
1121  step[2] = output[2] + output[(8 - 3)];
1122  step[3] = output[3] + output[(8 - 4)];
1123  step[4] = -output[4] + output[(8 - 5)];
1124  step[5] = -output[5] + output[(8 - 6)];
1125  step[6] = -output[6] + output[(8 - 7)];
1126  step[7] = -output[7] + output[(8 - 8)];
1127  step[8] = output[8];
1128  step[9] = output[9];
1129  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1130  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1131  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1132  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1133  step[14] = output[14];
1134  step[15] = output[15];
1135
1136  step[16] = output[16] + output[23];
1137  step[17] = output[17] + output[22];
1138  step[18] = output[18] + output[21];
1139  step[19] = output[19] + output[20];
1140  step[20] = -output[20] + output[19];
1141  step[21] = -output[21] + output[18];
1142  step[22] = -output[22] + output[17];
1143  step[23] = -output[23] + output[16];
1144  step[24] = -output[24] + output[31];
1145  step[25] = -output[25] + output[30];
1146  step[26] = -output[26] + output[29];
1147  step[27] = -output[27] + output[28];
1148  step[28] = output[28] + output[27];
1149  step[29] = output[29] + output[26];
1150  step[30] = output[30] + output[25];
1151  step[31] = output[31] + output[24];
1152
1153  // Stage 4
1154  output[0] = step[0] + step[3];
1155  output[1] = step[1] + step[2];
1156  output[2] = -step[2] + step[1];
1157  output[3] = -step[3] + step[0];
1158  output[4] = step[4];
1159  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1160  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1161  output[7] = step[7];
1162  output[8] = step[8] + step[11];
1163  output[9] = step[9] + step[10];
1164  output[10] = -step[10] + step[9];
1165  output[11] = -step[11] + step[8];
1166  output[12] = -step[12] + step[15];
1167  output[13] = -step[13] + step[14];
1168  output[14] = step[14] + step[13];
1169  output[15] = step[15] + step[12];
1170
1171  output[16] = step[16];
1172  output[17] = step[17];
1173  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1174  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1175  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1176  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1177  output[22] = step[22];
1178  output[23] = step[23];
1179  output[24] = step[24];
1180  output[25] = step[25];
1181  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1182  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1183  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1184  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1185  output[30] = step[30];
1186  output[31] = step[31];
1187
1188  // Stage 5
1189  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1190  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1191  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1192  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1193  step[4] = output[4] + output[5];
1194  step[5] = -output[5] + output[4];
1195  step[6] = -output[6] + output[7];
1196  step[7] = output[7] + output[6];
1197  step[8] = output[8];
1198  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1199  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1200  step[11] = output[11];
1201  step[12] = output[12];
1202  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1203  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1204  step[15] = output[15];
1205
1206  step[16] = output[16] + output[19];
1207  step[17] = output[17] + output[18];
1208  step[18] = -output[18] + output[17];
1209  step[19] = -output[19] + output[16];
1210  step[20] = -output[20] + output[23];
1211  step[21] = -output[21] + output[22];
1212  step[22] = output[22] + output[21];
1213  step[23] = output[23] + output[20];
1214  step[24] = output[24] + output[27];
1215  step[25] = output[25] + output[26];
1216  step[26] = -output[26] + output[25];
1217  step[27] = -output[27] + output[24];
1218  step[28] = -output[28] + output[31];
1219  step[29] = -output[29] + output[30];
1220  step[30] = output[30] + output[29];
1221  step[31] = output[31] + output[28];
1222
1223  // Stage 6
1224  output[0] = step[0];
1225  output[1] = step[1];
1226  output[2] = step[2];
1227  output[3] = step[3];
1228  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1229  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1230  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1231  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1232  output[8] = step[8] + step[9];
1233  output[9] = -step[9] + step[8];
1234  output[10] = -step[10] + step[11];
1235  output[11] = step[11] + step[10];
1236  output[12] = step[12] + step[13];
1237  output[13] = -step[13] + step[12];
1238  output[14] = -step[14] + step[15];
1239  output[15] = step[15] + step[14];
1240
1241  output[16] = step[16];
1242  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1243  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1244  output[19] = step[19];
1245  output[20] = step[20];
1246  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1247  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1248  output[23] = step[23];
1249  output[24] = step[24];
1250  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1251  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1252  output[27] = step[27];
1253  output[28] = step[28];
1254  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1255  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1256  output[31] = step[31];
1257
1258  // Stage 7
1259  step[0] = output[0];
1260  step[1] = output[1];
1261  step[2] = output[2];
1262  step[3] = output[3];
1263  step[4] = output[4];
1264  step[5] = output[5];
1265  step[6] = output[6];
1266  step[7] = output[7];
1267  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1268  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1269  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1270  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1271  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1272  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1273  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1274  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1275
1276  step[16] = output[16] + output[17];
1277  step[17] = -output[17] + output[16];
1278  step[18] = -output[18] + output[19];
1279  step[19] = output[19] + output[18];
1280  step[20] = output[20] + output[21];
1281  step[21] = -output[21] + output[20];
1282  step[22] = -output[22] + output[23];
1283  step[23] = output[23] + output[22];
1284  step[24] = output[24] + output[25];
1285  step[25] = -output[25] + output[24];
1286  step[26] = -output[26] + output[27];
1287  step[27] = output[27] + output[26];
1288  step[28] = output[28] + output[29];
1289  step[29] = -output[29] + output[28];
1290  step[30] = -output[30] + output[31];
1291  step[31] = output[31] + output[30];
1292
1293  // Final stage --- outputs indices are bit-reversed.
1294  output[0]  = step[0];
1295  output[16] = step[1];
1296  output[8]  = step[2];
1297  output[24] = step[3];
1298  output[4]  = step[4];
1299  output[20] = step[5];
1300  output[12] = step[6];
1301  output[28] = step[7];
1302  output[2]  = step[8];
1303  output[18] = step[9];
1304  output[10] = step[10];
1305  output[26] = step[11];
1306  output[6]  = step[12];
1307  output[22] = step[13];
1308  output[14] = step[14];
1309  output[30] = step[15];
1310
1311  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1312  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1313  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1314  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1315  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1316  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1317  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1318  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1319  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1320  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1321  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1322  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1323  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1324  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1325  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1326  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1327}
1328
1329void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
1330  int shortpitch = pitch >> 1;
1331  int i, j;
1332  int output[32 * 32];
1333
1334  // Columns
1335  for (i = 0; i < 32; ++i) {
1336    int temp_in[32], temp_out[32];
1337    for (j = 0; j < 32; ++j)
1338      temp_in[j] = input[j * shortpitch + i] << 2;
1339    dct32_1d(temp_in, temp_out, 0);
1340    for (j = 0; j < 32; ++j)
1341      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1342  }
1343
1344  // Rows
1345  for (i = 0; i < 32; ++i) {
1346    int temp_in[32], temp_out[32];
1347    for (j = 0; j < 32; ++j)
1348      temp_in[j] = output[j + i * 32];
1349    dct32_1d(temp_in, temp_out, 0);
1350    for (j = 0; j < 32; ++j)
1351      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1352  }
1353}
1354
1355// Note that although we use dct_32_round in dct32_1d computation flow,
1356// this 2d fdct32x32 for rate-distortion optimization loop is operating
1357// within 16 bits precision.
1358void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
1359  int shortpitch = pitch >> 1;
1360  int i, j;
1361  int output[32 * 32];
1362
1363  // Columns
1364  for (i = 0; i < 32; ++i) {
1365    int temp_in[32], temp_out[32];
1366    for (j = 0; j < 32; ++j)
1367      temp_in[j] = input[j * shortpitch + i] << 2;
1368    dct32_1d(temp_in, temp_out, 0);
1369    for (j = 0; j < 32; ++j)
1370      // TODO(cd): see quality impact of only doing
1371      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1372      //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1373      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1374  }
1375
1376  // Rows
1377  for (i = 0; i < 32; ++i) {
1378    int temp_in[32], temp_out[32];
1379    for (j = 0; j < 32; ++j)
1380      temp_in[j] = output[j + i * 32];
1381    dct32_1d(temp_in, temp_out, 1);
1382    for (j = 0; j < 32; ++j)
1383      out[j + i * 32] = temp_out[j];
1384  }
1385}
1386