1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <math.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16#include "vp9/common/vp9_systemdependent.h"
17#include "vp9/common/vp9_blockd.h"
18#include "vp9/common/vp9_common.h"
19#include "vp9/common/vp9_idct.h"
20
21void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
22/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
23   0.5 shifts per pixel. */
24  int i;
25  int16_t output[16];
26  int a1, b1, c1, d1, e1;
27  const int16_t *ip = input;
28  int16_t *op = output;
29
30  for (i = 0; i < 4; i++) {
31    a1 = ip[0] >> UNIT_QUANT_SHIFT;
32    c1 = ip[1] >> UNIT_QUANT_SHIFT;
33    d1 = ip[2] >> UNIT_QUANT_SHIFT;
34    b1 = ip[3] >> UNIT_QUANT_SHIFT;
35    a1 += c1;
36    d1 -= b1;
37    e1 = (a1 - d1) >> 1;
38    b1 = e1 - b1;
39    c1 = e1 - c1;
40    a1 -= b1;
41    d1 += c1;
42    op[0] = a1;
43    op[1] = b1;
44    op[2] = c1;
45    op[3] = d1;
46    ip += 4;
47    op += 4;
48  }
49
50  ip = output;
51  for (i = 0; i < 4; i++) {
52    a1 = ip[4 * 0];
53    c1 = ip[4 * 1];
54    d1 = ip[4 * 2];
55    b1 = ip[4 * 3];
56    a1 += c1;
57    d1 -= b1;
58    e1 = (a1 - d1) >> 1;
59    b1 = e1 - b1;
60    c1 = e1 - c1;
61    a1 -= b1;
62    d1 += c1;
63    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
64    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
65    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
66    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
67
68    ip++;
69    dest++;
70  }
71}
72
73void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
74  int i;
75  int a1, e1;
76  int16_t tmp[4];
77  const int16_t *ip = in;
78  int16_t *op = tmp;
79
80  a1 = ip[0] >> UNIT_QUANT_SHIFT;
81  e1 = a1 >> 1;
82  a1 -= e1;
83  op[0] = a1;
84  op[1] = op[2] = op[3] = e1;
85
86  ip = tmp;
87  for (i = 0; i < 4; i++) {
88    e1 = ip[0] >> 1;
89    a1 = ip[0] - e1;
90    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
91    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
92    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
93    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
94    ip++;
95    dest++;
96  }
97}
98
99static void idct4(const int16_t *input, int16_t *output) {
100  int16_t step[4];
101  int temp1, temp2;
102  // stage 1
103  temp1 = (input[0] + input[2]) * cospi_16_64;
104  temp2 = (input[0] - input[2]) * cospi_16_64;
105  step[0] = dct_const_round_shift(temp1);
106  step[1] = dct_const_round_shift(temp2);
107  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
108  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
109  step[2] = dct_const_round_shift(temp1);
110  step[3] = dct_const_round_shift(temp2);
111
112  // stage 2
113  output[0] = step[0] + step[3];
114  output[1] = step[1] + step[2];
115  output[2] = step[1] - step[2];
116  output[3] = step[0] - step[3];
117}
118
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  /* Full 4x4 inverse DCT, rows then columns; the rounded result is
     added to the destination predictor and clipped to pixel range. */
  int16_t buf[4 * 4];
  int16_t col_in[4], col_out[4];
  int r, c, j;

  /* Horizontal pass. */
  for (r = 0; r < 4; ++r)
    idct4(input + r * 4, buf + r * 4);

  /* Vertical pass, then round (1/16), add to prediction and clip. */
  for (c = 0; c < 4; ++c) {
    for (j = 0; j < 4; ++j)
      col_in[j] = buf[j * 4 + c];
    idct4(col_in, col_out);
    for (j = 0; j < 4; ++j) {
      const int v = ROUND_POWER_OF_TWO(col_out[j], 4);
      dest[j * stride + c] = clip_pixel(v + dest[j * stride + c]);
    }
  }
}
142
143void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
144  int i;
145  int a1;
146  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
147  out = dct_const_round_shift(out * cospi_16_64);
148  a1 = ROUND_POWER_OF_TWO(out, 4);
149
150  for (i = 0; i < 4; i++) {
151    dest[0] = clip_pixel(dest[0] + a1);
152    dest[1] = clip_pixel(dest[1] + a1);
153    dest[2] = clip_pixel(dest[2] + a1);
154    dest[3] = clip_pixel(dest[3] + a1);
155    dest += dest_stride;
156  }
157}
158
// 8-point inverse DCT butterfly (one dimension).  The even inputs
// (0, 2, 4, 6) are handled by the embedded 4-point transform; the odd
// inputs (1, 3, 5, 7) go through two extra rotation/butterfly stages.
// Intermediates truncate to int16_t, which is part of the bit-exact
// reference behavior.
static void idct8(const int16_t *input, int16_t *output) {
  int16_t step1[8], step2[8];
  int temp1, temp2;
  // stage 1
  // Even inputs loaded in the order idct4() expects
  // (step1[0..3] = input[0], input[2], input[4], input[6]);
  // odd inputs rotated by the cospi_4/28 and cospi_12/20 pairs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  // stage 2 & stage 3 - even half
  idct4(step1, step1);

  // stage 2 - odd half
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  // stage 4: final butterfly combining even and odd halves
  output[0] = step1[0] + step1[7];
  output[1] = step1[1] + step1[6];
  output[2] = step1[2] + step1[5];
  output[3] = step1[3] + step1[4];
  output[4] = step1[3] - step1[4];
  output[5] = step1[2] - step1[5];
  output[6] = step1[1] - step1[6];
  output[7] = step1[0] - step1[7];
}
203
void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  /* Full 8x8 inverse DCT (all 64 coefficients), rows then columns,
     with the rounded result added into the destination predictor. */
  int16_t buf[8 * 8];
  int16_t col_in[8], col_out[8];
  int r, c, j;

  /* Horizontal pass. */
  for (r = 0; r < 8; ++r)
    idct8(input + r * 8, buf + r * 8);

  /* Vertical pass, then round (1/32), add to prediction and clip. */
  for (c = 0; c < 8; ++c) {
    for (j = 0; j < 8; ++j)
      col_in[j] = buf[j * 8 + c];
    idct8(col_in, col_out);
    for (j = 0; j < 8; ++j) {
      const int v = ROUND_POWER_OF_TWO(col_out[j], 5);
      dest[j * stride + c] = clip_pixel(v + dest[j * stride + c]);
    }
  }
}
227
228void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
229  int i, j;
230  int a1;
231  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
232  out = dct_const_round_shift(out * cospi_16_64);
233  a1 = ROUND_POWER_OF_TWO(out, 5);
234  for (j = 0; j < 8; ++j) {
235    for (i = 0; i < 8; ++i)
236      dest[i] = clip_pixel(dest[i] + a1);
237    dest += stride;
238  }
239}
240
// 4-point inverse ADST (one dimension), built on the sinpi_{1..4}_9
// constant set.  An all-zero input short-circuits to an all-zero output.
static void iadst4(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[0];
  int x1 = input[1];
  int x2 = input[2];
  int x3 = input[3];

  // Fast path: nothing to compute for an all-zero row/column.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  // Products with the sine constants; s7 gathers the inputs that share
  // the common sinpi_3_9 factor applied below.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}
282
283void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
284                         int tx_type) {
285  const transform_2d IHT_4[] = {
286    { idct4, idct4  },  // DCT_DCT  = 0
287    { iadst4, idct4  },   // ADST_DCT = 1
288    { idct4, iadst4 },    // DCT_ADST = 2
289    { iadst4, iadst4 }      // ADST_ADST = 3
290  };
291
292  int i, j;
293  int16_t out[4 * 4];
294  int16_t *outptr = out;
295  int16_t temp_in[4], temp_out[4];
296
297  // inverse transform row vectors
298  for (i = 0; i < 4; ++i) {
299    IHT_4[tx_type].rows(input, outptr);
300    input  += 4;
301    outptr += 4;
302  }
303
304  // inverse transform column vectors
305  for (i = 0; i < 4; ++i) {
306    for (j = 0; j < 4; ++j)
307      temp_in[j] = out[j * 4 + i];
308    IHT_4[tx_type].cols(temp_in, temp_out);
309    for (j = 0; j < 4; ++j)
310      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
311                                  + dest[j * stride + i]);
312  }
313}
// 8-point inverse ADST (one dimension).  Inputs are consumed in an
// interleaved order (x0 = input[7], x1 = input[0], ...); an all-zero
// input short-circuits to an all-zero output.  Three rotation/butterfly
// stages follow, and half of the outputs are negated as part of the
// transform definition.
static void iadst8(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[7];
  int x1 = input[0];
  int x2 = input[5];
  int x3 = input[2];
  int x4 = input[3];
  int x5 = input[4];
  int x6 = input[1];
  int x7 = input[6];

  // Fast path: nothing to compute for an all-zero row/column.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = dct_const_round_shift(s0 + s4);
  x1 = dct_const_round_shift(s1 + s5);
  x2 = dct_const_round_shift(s2 + s6);
  x3 = dct_const_round_shift(s3 + s7);
  x4 = dct_const_round_shift(s0 - s4);
  x5 = dct_const_round_shift(s1 - s5);
  x6 = dct_const_round_shift(s2 - s6);
  x7 = dct_const_round_shift(s3 - s7);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);

  // Final permutation with sign alternation.
  output[0] =  x0;
  output[1] = -x4;
  output[2] =  x6;
  output[3] = -x2;
  output[4] =  x3;
  output[5] = -x7;
  output[6] =  x5;
  output[7] = -x1;
}
390
// Row/column 1-D transform pairs for the 8x8 hybrid inverse transform,
// indexed by tx_type ({rows, cols}).
static const transform_2d IHT_8[] = {
  { idct8,  idct8  },  // DCT_DCT  = 0
  { iadst8, idct8  },  // ADST_DCT = 1
  { idct8,  iadst8 },  // DCT_ADST = 2
  { iadst8, iadst8 }   // ADST_ADST = 3
};
397
398void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
399                         int tx_type) {
400  int i, j;
401  int16_t out[8 * 8];
402  int16_t *outptr = out;
403  int16_t temp_in[8], temp_out[8];
404  const transform_2d ht = IHT_8[tx_type];
405
406  // inverse transform row vectors
407  for (i = 0; i < 8; ++i) {
408    ht.rows(input, outptr);
409    input += 8;
410    outptr += 8;
411  }
412
413  // inverse transform column vectors
414  for (i = 0; i < 8; ++i) {
415    for (j = 0; j < 8; ++j)
416      temp_in[j] = out[j * 8 + i];
417    ht.cols(temp_in, temp_out);
418    for (j = 0; j < 8; ++j)
419      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
420                                  + dest[j * stride + i]);
421  }
422}
423
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  /* 8x8 inverse DCT for sparse blocks: only the first rows carry
     non-zero coefficients, so the row pass covers rows 0-3 only and the
     rest of the intermediate buffer stays zero. */
  int16_t buf[8 * 8] = { 0 };
  int16_t col_in[8], col_out[8];
  int r, c, j;

  /* Row pass over the 4 potentially non-zero rows. */
  for (r = 0; r < 4; ++r)
    idct8(input + r * 8, buf + r * 8);

  /* Full column pass, then round (1/32), add to prediction and clip. */
  for (c = 0; c < 8; ++c) {
    for (j = 0; j < 8; ++j)
      col_in[j] = buf[j * 8 + c];
    idct8(col_in, col_out);
    for (j = 0; j < 8; ++j) {
      const int v = ROUND_POWER_OF_TWO(col_out[j], 5);
      dest[j * stride + c] = clip_pixel(v + dest[j * stride + c]);
    }
  }
}
448
// 16-point inverse DCT butterfly (one dimension).  Stage 1 loads the
// inputs in bit-reversed order (the indices are written as n/2
// compile-time constants: 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3,
// 11, 7, 15); six further butterfly/rotation stages follow.
// Intermediates truncate to int16_t, which is part of the bit-exact
// reference behavior.
static void idct16(const int16_t *input, int16_t *output) {
  int16_t step1[16], step2[16];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: rotate the 8..15 half; 0..7 pass through
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly combining the two halves
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}
613
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  /* Full 16x16 inverse DCT (all 256 coefficients), rows then columns,
     with the rounded result added into the destination predictor. */
  int16_t buf[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c, j;

  /* Horizontal pass. */
  for (r = 0; r < 16; ++r)
    idct16(input + r * 16, buf + r * 16);

  /* Vertical pass, then round (1/64), add to prediction and clip. */
  for (c = 0; c < 16; ++c) {
    for (j = 0; j < 16; ++j)
      col_in[j] = buf[j * 16 + c];
    idct16(col_in, col_out);
    for (j = 0; j < 16; ++j) {
      const int v = ROUND_POWER_OF_TWO(col_out[j], 6);
      dest[j * stride + c] = clip_pixel(v + dest[j * stride + c]);
    }
  }
}
637
// 16-point inverse ADST (one dimension).  Inputs are consumed in an
// interleaved order (x0 = input[15], x1 = input[0], ...); an all-zero
// input short-circuits to an all-zero output.  Four rotation/butterfly
// stages follow, and several outputs are negated as part of the
// transform definition.
static void iadst16(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];

  // Fast path: nothing to compute for an all-zero row/column.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8  = dct_const_round_shift(s0 - s8);
  x9  = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  // Final permutation with sign alternation.
  output[0] =  x0;
  output[1] = -x8;
  output[2] =  x12;
  output[3] = -x4;
  output[4] =  x6;
  output[5] =  x14;
  output[6] =  x10;
  output[7] =  x2;
  output[8] =  x3;
  output[9] =  x11;
  output[10] =  x15;
  output[11] =  x7;
  output[12] =  x5;
  output[13] = -x13;
  output[14] =  x9;
  output[15] = -x1;
}
808
// Row/column 1-D transform pairs for the 16x16 hybrid inverse transform,
// indexed by tx_type ({rows, cols}).
static const transform_2d IHT_16[] = {
  { idct16,  idct16  },  // DCT_DCT  = 0
  { iadst16, idct16  },  // ADST_DCT = 1
  { idct16,  iadst16 },  // DCT_ADST = 2
  { iadst16, iadst16 }   // ADST_ADST = 3
};
815
816void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
817                            int tx_type) {
818  int i, j;
819  int16_t out[16 * 16];
820  int16_t *outptr = out;
821  int16_t temp_in[16], temp_out[16];
822  const transform_2d ht = IHT_16[tx_type];
823
824  // Rows
825  for (i = 0; i < 16; ++i) {
826    ht.rows(input, outptr);
827    input += 16;
828    outptr += 16;
829  }
830
831  // Columns
832  for (i = 0; i < 16; ++i) {
833    for (j = 0; j < 16; ++j)
834      temp_in[j] = out[j * 16 + i];
835    ht.cols(temp_in, temp_out);
836    for (j = 0; j < 16; ++j)
837      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
838                                        + dest[j * stride + i]);
839  }
840}
841
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  /* 16x16 inverse DCT for sparse blocks: all non-zero coefficients sit
     in the upper-left 4x4 region, so the row pass covers rows 0-3 only
     and the rest of the intermediate buffer stays zero. */
  int16_t buf[16 * 16] = { 0 };
  int16_t col_in[16], col_out[16];
  int r, c, j;

  /* Row pass over the 4 potentially non-zero rows. */
  for (r = 0; r < 4; ++r)
    idct16(input + r * 16, buf + r * 16);

  /* Full column pass, then round (1/64), add to prediction and clip. */
  for (c = 0; c < 16; ++c) {
    for (j = 0; j < 16; ++j)
      col_in[j] = buf[j * 16 + c];
    idct16(col_in, col_out);
    for (j = 0; j < 16; ++j) {
      const int v = ROUND_POWER_OF_TWO(col_out[j], 6);
      dest[j * stride + c] = clip_pixel(v + dest[j * stride + c]);
    }
  }
}
866
867void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
868  int i, j;
869  int a1;
870  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
871  out = dct_const_round_shift(out * cospi_16_64);
872  a1 = ROUND_POWER_OF_TWO(out, 6);
873  for (j = 0; j < 16; ++j) {
874    for (i = 0; i < 16; ++i)
875      dest[i] = clip_pixel(dest[i] + a1);
876    dest += stride;
877  }
878}
879
880static void idct32(const int16_t *input, int16_t *output) {
881  int16_t step1[32], step2[32];
882  int temp1, temp2;
883
884  // stage 1
885  step1[0] = input[0];
886  step1[1] = input[16];
887  step1[2] = input[8];
888  step1[3] = input[24];
889  step1[4] = input[4];
890  step1[5] = input[20];
891  step1[6] = input[12];
892  step1[7] = input[28];
893  step1[8] = input[2];
894  step1[9] = input[18];
895  step1[10] = input[10];
896  step1[11] = input[26];
897  step1[12] = input[6];
898  step1[13] = input[22];
899  step1[14] = input[14];
900  step1[15] = input[30];
901
902  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
903  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
904  step1[16] = dct_const_round_shift(temp1);
905  step1[31] = dct_const_round_shift(temp2);
906
907  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
908  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
909  step1[17] = dct_const_round_shift(temp1);
910  step1[30] = dct_const_round_shift(temp2);
911
912  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
913  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
914  step1[18] = dct_const_round_shift(temp1);
915  step1[29] = dct_const_round_shift(temp2);
916
917  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
918  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
919  step1[19] = dct_const_round_shift(temp1);
920  step1[28] = dct_const_round_shift(temp2);
921
922  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
923  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
924  step1[20] = dct_const_round_shift(temp1);
925  step1[27] = dct_const_round_shift(temp2);
926
927  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
928  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
929  step1[21] = dct_const_round_shift(temp1);
930  step1[26] = dct_const_round_shift(temp2);
931
932  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
933  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
934  step1[22] = dct_const_round_shift(temp1);
935  step1[25] = dct_const_round_shift(temp2);
936
937  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
938  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
939  step1[23] = dct_const_round_shift(temp1);
940  step1[24] = dct_const_round_shift(temp2);
941
942  // stage 2
943  step2[0] = step1[0];
944  step2[1] = step1[1];
945  step2[2] = step1[2];
946  step2[3] = step1[3];
947  step2[4] = step1[4];
948  step2[5] = step1[5];
949  step2[6] = step1[6];
950  step2[7] = step1[7];
951
952  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
953  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
954  step2[8] = dct_const_round_shift(temp1);
955  step2[15] = dct_const_round_shift(temp2);
956
957  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
958  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
959  step2[9] = dct_const_round_shift(temp1);
960  step2[14] = dct_const_round_shift(temp2);
961
962  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
963  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
964  step2[10] = dct_const_round_shift(temp1);
965  step2[13] = dct_const_round_shift(temp2);
966
967  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
968  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
969  step2[11] = dct_const_round_shift(temp1);
970  step2[12] = dct_const_round_shift(temp2);
971
972  step2[16] = step1[16] + step1[17];
973  step2[17] = step1[16] - step1[17];
974  step2[18] = -step1[18] + step1[19];
975  step2[19] = step1[18] + step1[19];
976  step2[20] = step1[20] + step1[21];
977  step2[21] = step1[20] - step1[21];
978  step2[22] = -step1[22] + step1[23];
979  step2[23] = step1[22] + step1[23];
980  step2[24] = step1[24] + step1[25];
981  step2[25] = step1[24] - step1[25];
982  step2[26] = -step1[26] + step1[27];
983  step2[27] = step1[26] + step1[27];
984  step2[28] = step1[28] + step1[29];
985  step2[29] = step1[28] - step1[29];
986  step2[30] = -step1[30] + step1[31];
987  step2[31] = step1[30] + step1[31];
988
989  // stage 3
990  step1[0] = step2[0];
991  step1[1] = step2[1];
992  step1[2] = step2[2];
993  step1[3] = step2[3];
994
995  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
996  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
997  step1[4] = dct_const_round_shift(temp1);
998  step1[7] = dct_const_round_shift(temp2);
999  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1000  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1001  step1[5] = dct_const_round_shift(temp1);
1002  step1[6] = dct_const_round_shift(temp2);
1003
1004  step1[8] = step2[8] + step2[9];
1005  step1[9] = step2[8] - step2[9];
1006  step1[10] = -step2[10] + step2[11];
1007  step1[11] = step2[10] + step2[11];
1008  step1[12] = step2[12] + step2[13];
1009  step1[13] = step2[12] - step2[13];
1010  step1[14] = -step2[14] + step2[15];
1011  step1[15] = step2[14] + step2[15];
1012
1013  step1[16] = step2[16];
1014  step1[31] = step2[31];
1015  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1016  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1017  step1[17] = dct_const_round_shift(temp1);
1018  step1[30] = dct_const_round_shift(temp2);
1019  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1020  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1021  step1[18] = dct_const_round_shift(temp1);
1022  step1[29] = dct_const_round_shift(temp2);
1023  step1[19] = step2[19];
1024  step1[20] = step2[20];
1025  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1026  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1027  step1[21] = dct_const_round_shift(temp1);
1028  step1[26] = dct_const_round_shift(temp2);
1029  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1030  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1031  step1[22] = dct_const_round_shift(temp1);
1032  step1[25] = dct_const_round_shift(temp2);
1033  step1[23] = step2[23];
1034  step1[24] = step2[24];
1035  step1[27] = step2[27];
1036  step1[28] = step2[28];
1037
1038  // stage 4
1039  temp1 = (step1[0] + step1[1]) * cospi_16_64;
1040  temp2 = (step1[0] - step1[1]) * cospi_16_64;
1041  step2[0] = dct_const_round_shift(temp1);
1042  step2[1] = dct_const_round_shift(temp2);
1043  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1044  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1045  step2[2] = dct_const_round_shift(temp1);
1046  step2[3] = dct_const_round_shift(temp2);
1047  step2[4] = step1[4] + step1[5];
1048  step2[5] = step1[4] - step1[5];
1049  step2[6] = -step1[6] + step1[7];
1050  step2[7] = step1[6] + step1[7];
1051
1052  step2[8] = step1[8];
1053  step2[15] = step1[15];
1054  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1055  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1056  step2[9] = dct_const_round_shift(temp1);
1057  step2[14] = dct_const_round_shift(temp2);
1058  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1059  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1060  step2[10] = dct_const_round_shift(temp1);
1061  step2[13] = dct_const_round_shift(temp2);
1062  step2[11] = step1[11];
1063  step2[12] = step1[12];
1064
1065  step2[16] = step1[16] + step1[19];
1066  step2[17] = step1[17] + step1[18];
1067  step2[18] = step1[17] - step1[18];
1068  step2[19] = step1[16] - step1[19];
1069  step2[20] = -step1[20] + step1[23];
1070  step2[21] = -step1[21] + step1[22];
1071  step2[22] = step1[21] + step1[22];
1072  step2[23] = step1[20] + step1[23];
1073
1074  step2[24] = step1[24] + step1[27];
1075  step2[25] = step1[25] + step1[26];
1076  step2[26] = step1[25] - step1[26];
1077  step2[27] = step1[24] - step1[27];
1078  step2[28] = -step1[28] + step1[31];
1079  step2[29] = -step1[29] + step1[30];
1080  step2[30] = step1[29] + step1[30];
1081  step2[31] = step1[28] + step1[31];
1082
1083  // stage 5
1084  step1[0] = step2[0] + step2[3];
1085  step1[1] = step2[1] + step2[2];
1086  step1[2] = step2[1] - step2[2];
1087  step1[3] = step2[0] - step2[3];
1088  step1[4] = step2[4];
1089  temp1 = (step2[6] - step2[5]) * cospi_16_64;
1090  temp2 = (step2[5] + step2[6]) * cospi_16_64;
1091  step1[5] = dct_const_round_shift(temp1);
1092  step1[6] = dct_const_round_shift(temp2);
1093  step1[7] = step2[7];
1094
1095  step1[8] = step2[8] + step2[11];
1096  step1[9] = step2[9] + step2[10];
1097  step1[10] = step2[9] - step2[10];
1098  step1[11] = step2[8] - step2[11];
1099  step1[12] = -step2[12] + step2[15];
1100  step1[13] = -step2[13] + step2[14];
1101  step1[14] = step2[13] + step2[14];
1102  step1[15] = step2[12] + step2[15];
1103
1104  step1[16] = step2[16];
1105  step1[17] = step2[17];
1106  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1107  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1108  step1[18] = dct_const_round_shift(temp1);
1109  step1[29] = dct_const_round_shift(temp2);
1110  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1111  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1112  step1[19] = dct_const_round_shift(temp1);
1113  step1[28] = dct_const_round_shift(temp2);
1114  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1115  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1116  step1[20] = dct_const_round_shift(temp1);
1117  step1[27] = dct_const_round_shift(temp2);
1118  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1119  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1120  step1[21] = dct_const_round_shift(temp1);
1121  step1[26] = dct_const_round_shift(temp2);
1122  step1[22] = step2[22];
1123  step1[23] = step2[23];
1124  step1[24] = step2[24];
1125  step1[25] = step2[25];
1126  step1[30] = step2[30];
1127  step1[31] = step2[31];
1128
1129  // stage 6
1130  step2[0] = step1[0] + step1[7];
1131  step2[1] = step1[1] + step1[6];
1132  step2[2] = step1[2] + step1[5];
1133  step2[3] = step1[3] + step1[4];
1134  step2[4] = step1[3] - step1[4];
1135  step2[5] = step1[2] - step1[5];
1136  step2[6] = step1[1] - step1[6];
1137  step2[7] = step1[0] - step1[7];
1138  step2[8] = step1[8];
1139  step2[9] = step1[9];
1140  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1141  temp2 = (step1[10] + step1[13]) * cospi_16_64;
1142  step2[10] = dct_const_round_shift(temp1);
1143  step2[13] = dct_const_round_shift(temp2);
1144  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1145  temp2 = (step1[11] + step1[12]) * cospi_16_64;
1146  step2[11] = dct_const_round_shift(temp1);
1147  step2[12] = dct_const_round_shift(temp2);
1148  step2[14] = step1[14];
1149  step2[15] = step1[15];
1150
1151  step2[16] = step1[16] + step1[23];
1152  step2[17] = step1[17] + step1[22];
1153  step2[18] = step1[18] + step1[21];
1154  step2[19] = step1[19] + step1[20];
1155  step2[20] = step1[19] - step1[20];
1156  step2[21] = step1[18] - step1[21];
1157  step2[22] = step1[17] - step1[22];
1158  step2[23] = step1[16] - step1[23];
1159
1160  step2[24] = -step1[24] + step1[31];
1161  step2[25] = -step1[25] + step1[30];
1162  step2[26] = -step1[26] + step1[29];
1163  step2[27] = -step1[27] + step1[28];
1164  step2[28] = step1[27] + step1[28];
1165  step2[29] = step1[26] + step1[29];
1166  step2[30] = step1[25] + step1[30];
1167  step2[31] = step1[24] + step1[31];
1168
1169  // stage 7
1170  step1[0] = step2[0] + step2[15];
1171  step1[1] = step2[1] + step2[14];
1172  step1[2] = step2[2] + step2[13];
1173  step1[3] = step2[3] + step2[12];
1174  step1[4] = step2[4] + step2[11];
1175  step1[5] = step2[5] + step2[10];
1176  step1[6] = step2[6] + step2[9];
1177  step1[7] = step2[7] + step2[8];
1178  step1[8] = step2[7] - step2[8];
1179  step1[9] = step2[6] - step2[9];
1180  step1[10] = step2[5] - step2[10];
1181  step1[11] = step2[4] - step2[11];
1182  step1[12] = step2[3] - step2[12];
1183  step1[13] = step2[2] - step2[13];
1184  step1[14] = step2[1] - step2[14];
1185  step1[15] = step2[0] - step2[15];
1186
1187  step1[16] = step2[16];
1188  step1[17] = step2[17];
1189  step1[18] = step2[18];
1190  step1[19] = step2[19];
1191  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1192  temp2 = (step2[20] + step2[27]) * cospi_16_64;
1193  step1[20] = dct_const_round_shift(temp1);
1194  step1[27] = dct_const_round_shift(temp2);
1195  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1196  temp2 = (step2[21] + step2[26]) * cospi_16_64;
1197  step1[21] = dct_const_round_shift(temp1);
1198  step1[26] = dct_const_round_shift(temp2);
1199  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1200  temp2 = (step2[22] + step2[25]) * cospi_16_64;
1201  step1[22] = dct_const_round_shift(temp1);
1202  step1[25] = dct_const_round_shift(temp2);
1203  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1204  temp2 = (step2[23] + step2[24]) * cospi_16_64;
1205  step1[23] = dct_const_round_shift(temp1);
1206  step1[24] = dct_const_round_shift(temp2);
1207  step1[28] = step2[28];
1208  step1[29] = step2[29];
1209  step1[30] = step2[30];
1210  step1[31] = step2[31];
1211
1212  // final stage
1213  output[0] = step1[0] + step1[31];
1214  output[1] = step1[1] + step1[30];
1215  output[2] = step1[2] + step1[29];
1216  output[3] = step1[3] + step1[28];
1217  output[4] = step1[4] + step1[27];
1218  output[5] = step1[5] + step1[26];
1219  output[6] = step1[6] + step1[25];
1220  output[7] = step1[7] + step1[24];
1221  output[8] = step1[8] + step1[23];
1222  output[9] = step1[9] + step1[22];
1223  output[10] = step1[10] + step1[21];
1224  output[11] = step1[11] + step1[20];
1225  output[12] = step1[12] + step1[19];
1226  output[13] = step1[13] + step1[18];
1227  output[14] = step1[14] + step1[17];
1228  output[15] = step1[15] + step1[16];
1229  output[16] = step1[15] - step1[16];
1230  output[17] = step1[14] - step1[17];
1231  output[18] = step1[13] - step1[18];
1232  output[19] = step1[12] - step1[19];
1233  output[20] = step1[11] - step1[20];
1234  output[21] = step1[10] - step1[21];
1235  output[22] = step1[9] - step1[22];
1236  output[23] = step1[8] - step1[23];
1237  output[24] = step1[7] - step1[24];
1238  output[25] = step1[6] - step1[25];
1239  output[26] = step1[5] - step1[26];
1240  output[27] = step1[4] - step1[27];
1241  output[28] = step1[3] - step1[28];
1242  output[29] = step1[2] - step1[29];
1243  output[30] = step1[1] - step1[30];
1244  output[31] = step1[0] - step1[31];
1245}
1246
1247void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
1248  int16_t out[32 * 32];
1249  int16_t *outptr = out;
1250  int i, j;
1251  int16_t temp_in[32], temp_out[32];
1252
1253  // Rows
1254  for (i = 0; i < 32; ++i) {
1255    int16_t zero_coeff[16];
1256    for (j = 0; j < 16; ++j)
1257      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1258    for (j = 0; j < 8; ++j)
1259      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1260    for (j = 0; j < 4; ++j)
1261      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1262    for (j = 0; j < 2; ++j)
1263      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1264
1265    if (zero_coeff[0] | zero_coeff[1])
1266      idct32(input, outptr);
1267    else
1268      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
1269    input += 32;
1270    outptr += 32;
1271  }
1272
1273  // Columns
1274  for (i = 0; i < 32; ++i) {
1275    for (j = 0; j < 32; ++j)
1276      temp_in[j] = out[j * 32 + i];
1277    idct32(temp_in, temp_out);
1278    for (j = 0; j < 32; ++j)
1279      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1280                                        + dest[j * stride + i]);
1281  }
1282}
1283
1284void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
1285  int16_t out[32 * 32] = {0};
1286  int16_t *outptr = out;
1287  int i, j;
1288  int16_t temp_in[32], temp_out[32];
1289
1290  // Rows
1291  // only upper-left 8x8 has non-zero coeff
1292  for (i = 0; i < 8; ++i) {
1293    idct32(input, outptr);
1294    input += 32;
1295    outptr += 32;
1296  }
1297
1298  // Columns
1299  for (i = 0; i < 32; ++i) {
1300    for (j = 0; j < 32; ++j)
1301      temp_in[j] = out[j * 32 + i];
1302    idct32(temp_in, temp_out);
1303    for (j = 0; j < 32; ++j)
1304      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1305                                  + dest[j * stride + i]);
1306  }
1307}
1308
1309void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
1310  int i, j;
1311  int a1;
1312
1313  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
1314  out = dct_const_round_shift(out * cospi_16_64);
1315  a1 = ROUND_POWER_OF_TWO(out, 6);
1316
1317  for (j = 0; j < 32; ++j) {
1318    for (i = 0; i < 32; ++i)
1319      dest[i] = clip_pixel(dest[i] + a1);
1320    dest += stride;
1321  }
1322}
1323
1324// idct
1325void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
1326  if (eob > 1)
1327    vp9_idct4x4_16_add(input, dest, stride);
1328  else
1329    vp9_idct4x4_1_add(input, dest, stride);
1330}
1331
1332
1333void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
1334  if (eob > 1)
1335    vp9_iwht4x4_16_add(input, dest, stride);
1336  else
1337    vp9_iwht4x4_1_add(input, dest, stride);
1338}
1339
1340void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
1341  // If dc is 1, then input[0] is the reconstructed value, do not need
1342  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
1343
1344  // The calculation can be simplified if there are not many non-zero dct
1345  // coefficients. Use eobs to decide what to do.
1346  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
1347  // Combine that with code here.
1348  if (eob == 1)
1349    // DC only DCT coefficient
1350    vp9_idct8x8_1_add(input, dest, stride);
1351  else if (eob <= 10)
1352    vp9_idct8x8_10_add(input, dest, stride);
1353  else
1354    vp9_idct8x8_64_add(input, dest, stride);
1355}
1356
1357void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
1358                       int eob) {
1359  /* The calculation can be simplified if there are not many non-zero dct
1360   * coefficients. Use eobs to separate different cases. */
1361  if (eob == 1)
1362    /* DC only DCT coefficient. */
1363    vp9_idct16x16_1_add(input, dest, stride);
1364  else if (eob <= 10)
1365    vp9_idct16x16_10_add(input, dest, stride);
1366  else
1367    vp9_idct16x16_256_add(input, dest, stride);
1368}
1369
1370void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
1371                       int eob) {
1372  if (eob == 1)
1373    vp9_idct32x32_1_add(input, dest, stride);
1374  else if (eob <= 34)
1375    // non-zero coeff only in upper-left 8x8
1376    vp9_idct32x32_34_add(input, dest, stride);
1377  else
1378    vp9_idct32x32_1024_add(input, dest, stride);
1379}
1380
1381// iht
1382void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
1383                    int stride, int eob) {
1384  if (tx_type == DCT_DCT)
1385    vp9_idct4x4_add(input, dest, stride, eob);
1386  else
1387    vp9_iht4x4_16_add(input, dest, stride, tx_type);
1388}
1389
1390void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
1391                    int stride, int eob) {
1392  if (tx_type == DCT_DCT) {
1393    vp9_idct8x8_add(input, dest, stride, eob);
1394  } else {
1395    vp9_iht8x8_64_add(input, dest, stride, tx_type);
1396  }
1397}
1398
1399void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
1400                      int stride, int eob) {
1401  if (tx_type == DCT_DCT) {
1402    vp9_idct16x16_add(input, dest, stride, eob);
1403  } else {
1404    vp9_iht16x16_256_add(input, dest, stride, tx_type);
1405  }
1406}
1407