1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <math.h>
12#include <string.h>
13
14#include "vpx_dsp/inv_txfm.h"
15
// Full (16-coefficient) 4x4 inverse Walsh-Hadamard transform, with the
// reconstructed residual added into the 8-bit destination block.
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // intermediate buffer holding the row-pass result
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: inverse transform each row into the intermediate buffer.
  for (i = 0; i < 4; i++) {
    // Undo the encoder's uniform-quantizer scaling before the butterfly.
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    // Lifting-based butterfly; the exact add/shift order makes this
    // integer transform reversible.
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  // Pass 2: inverse transform each column and add the result into dest.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    // Same lifting butterfly as the row pass.
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}
67
68void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
69  int i;
70  tran_high_t a1, e1;
71  tran_low_t tmp[4];
72  const tran_low_t *ip = in;
73  tran_low_t *op = tmp;
74
75  a1 = ip[0] >> UNIT_QUANT_SHIFT;
76  e1 = a1 >> 1;
77  a1 -= e1;
78  op[0] = WRAPLOW(a1, 8);
79  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
80
81  ip = tmp;
82  for (i = 0; i < 4; i++) {
83    e1 = ip[0] >> 1;
84    a1 = ip[0] - e1;
85    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
86    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
87    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
88    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
89    ip++;
90    dest++;
91  }
92}
93
94void idct4_c(const tran_low_t *input, tran_low_t *output) {
95  tran_low_t step[4];
96  tran_high_t temp1, temp2;
97  // stage 1
98  temp1 = (input[0] + input[2]) * cospi_16_64;
99  temp2 = (input[0] - input[2]) * cospi_16_64;
100  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
101  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
102  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
103  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
104  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
105  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
106
107  // stage 2
108  output[0] = WRAPLOW(step[0] + step[3], 8);
109  output[1] = WRAPLOW(step[1] + step[2], 8);
110  output[2] = WRAPLOW(step[1] - step[2], 8);
111  output[3] = WRAPLOW(step[0] - step[3], 8);
112}
113
114void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
115  tran_low_t out[4 * 4];
116  tran_low_t *outptr = out;
117  int i, j;
118  tran_low_t temp_in[4], temp_out[4];
119
120  // Rows
121  for (i = 0; i < 4; ++i) {
122    idct4_c(input, outptr);
123    input += 4;
124    outptr += 4;
125  }
126
127  // Columns
128  for (i = 0; i < 4; ++i) {
129    for (j = 0; j < 4; ++j)
130      temp_in[j] = out[j * 4 + i];
131    idct4_c(temp_in, temp_out);
132    for (j = 0; j < 4; ++j) {
133      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
134                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
135    }
136  }
137}
138
139void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
140                         int dest_stride) {
141  int i;
142  tran_high_t a1;
143  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
144  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
145  a1 = ROUND_POWER_OF_TWO(out, 4);
146
147  for (i = 0; i < 4; i++) {
148    dest[0] = clip_pixel_add(dest[0], a1);
149    dest[1] = clip_pixel_add(dest[1], a1);
150    dest[2] = clip_pixel_add(dest[2], a1);
151    dest[3] = clip_pixel_add(dest[3], a1);
152    dest += dest_stride;
153  }
154}
155
// 8-point 1-D inverse DCT. Even-indexed inputs go through the 4-point IDCT;
// odd-indexed inputs form the odd half, recombined in the final stage.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  // Even inputs are gathered into step1[0..3] in the order idct4_c expects.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2 & stage 3 - even half
  // In-place 4-point IDCT on the even coefficients.
  idct4_c(step1, step1);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4
  // Final butterfly combining the even (0..3) and odd (4..7) halves.
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}
200
201void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
202  tran_low_t out[8 * 8];
203  tran_low_t *outptr = out;
204  int i, j;
205  tran_low_t temp_in[8], temp_out[8];
206
207  // First transform rows
208  for (i = 0; i < 8; ++i) {
209    idct8_c(input, outptr);
210    input += 8;
211    outptr += 8;
212  }
213
214  // Then transform columns
215  for (i = 0; i < 8; ++i) {
216    for (j = 0; j < 8; ++j)
217      temp_in[j] = out[j * 8 + i];
218    idct8_c(temp_in, temp_out);
219    for (j = 0; j < 8; ++j) {
220      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
221                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
222    }
223  }
224}
225
226void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
227  int i, j;
228  tran_high_t a1;
229  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
230  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
231  a1 = ROUND_POWER_OF_TWO(out, 5);
232  for (j = 0; j < 8; ++j) {
233    for (i = 0; i < 8; ++i)
234      dest[i] = clip_pixel_add(dest[i], a1);
235    dest += stride;
236  }
237}
238
// 4-point 1-D inverse ADST using the sinpi_k_9 constant set.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // All-zero input: skip the arithmetic entirely.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
275
276void iadst8_c(const tran_low_t *input, tran_low_t *output) {
277  int s0, s1, s2, s3, s4, s5, s6, s7;
278
279  tran_high_t x0 = input[7];
280  tran_high_t x1 = input[0];
281  tran_high_t x2 = input[5];
282  tran_high_t x3 = input[2];
283  tran_high_t x4 = input[3];
284  tran_high_t x5 = input[4];
285  tran_high_t x6 = input[1];
286  tran_high_t x7 = input[6];
287
288  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
289    output[0] = output[1] = output[2] = output[3] = output[4]
290              = output[5] = output[6] = output[7] = 0;
291    return;
292  }
293
294  // stage 1
295  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
296  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
297  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
298  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
299  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
300  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
301  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
302  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
303
304  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
305  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
306  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
307  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
308  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
309  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
310  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
311  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
312
313  // stage 2
314  s0 = (int)x0;
315  s1 = (int)x1;
316  s2 = (int)x2;
317  s3 = (int)x3;
318  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
319  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
320  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
321  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
322
323  x0 = WRAPLOW(s0 + s2, 8);
324  x1 = WRAPLOW(s1 + s3, 8);
325  x2 = WRAPLOW(s0 - s2, 8);
326  x3 = WRAPLOW(s1 - s3, 8);
327  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
328  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
329  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
330  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
331
332  // stage 3
333  s2 = (int)(cospi_16_64 * (x2 + x3));
334  s3 = (int)(cospi_16_64 * (x2 - x3));
335  s6 = (int)(cospi_16_64 * (x6 + x7));
336  s7 = (int)(cospi_16_64 * (x6 - x7));
337
338  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
339  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
340  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
341  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
342
343  output[0] = WRAPLOW(x0, 8);
344  output[1] = WRAPLOW(-x4, 8);
345  output[2] = WRAPLOW(x6, 8);
346  output[3] = WRAPLOW(-x2, 8);
347  output[4] = WRAPLOW(x3, 8);
348  output[5] = WRAPLOW(-x7, 8);
349  output[6] = WRAPLOW(x5, 8);
350  output[7] = WRAPLOW(-x1, 8);
351}
352
353void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
354  tran_low_t out[8 * 8] = { 0 };
355  tran_low_t *outptr = out;
356  int i, j;
357  tran_low_t temp_in[8], temp_out[8];
358
359  // First transform rows
360  // only first 4 row has non-zero coefs
361  for (i = 0; i < 4; ++i) {
362    idct8_c(input, outptr);
363    input += 8;
364    outptr += 8;
365  }
366
367  // Then transform columns
368  for (i = 0; i < 8; ++i) {
369    for (j = 0; j < 8; ++j)
370      temp_in[j] = out[j * 8 + i];
371    idct8_c(temp_in, temp_out);
372    for (j = 0; j < 8; ++j) {
373      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
374                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
375    }
376  }
377}
378
// 16-point 1-D inverse DCT, written as a 7-stage butterfly network with
// intermediate rounding (dct_const_round_shift) and 16-bit wrapping
// (WRAPLOW) after each multiply stage.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  // Load the coefficients in bit-reversed order
  // (0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 after the /2).
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  // Even half (0..7) passes through; odd half (8..15) gets rotations.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // Final butterfly: combine the two halves into the 16 outputs.
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}
543
544void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
545                             int stride) {
546  tran_low_t out[16 * 16];
547  tran_low_t *outptr = out;
548  int i, j;
549  tran_low_t temp_in[16], temp_out[16];
550
551  // First transform rows
552  for (i = 0; i < 16; ++i) {
553    idct16_c(input, outptr);
554    input += 16;
555    outptr += 16;
556  }
557
558  // Then transform columns
559  for (i = 0; i < 16; ++i) {
560    for (j = 0; j < 16; ++j)
561      temp_in[j] = out[j * 16 + i];
562    idct16_c(temp_in, temp_out);
563    for (j = 0; j < 16; ++j) {
564      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
565                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
566    }
567  }
568}
569
570void iadst16_c(const tran_low_t *input, tran_low_t *output) {
571  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
572  tran_high_t s9, s10, s11, s12, s13, s14, s15;
573
574  tran_high_t x0 = input[15];
575  tran_high_t x1 = input[0];
576  tran_high_t x2 = input[13];
577  tran_high_t x3 = input[2];
578  tran_high_t x4 = input[11];
579  tran_high_t x5 = input[4];
580  tran_high_t x6 = input[9];
581  tran_high_t x7 = input[6];
582  tran_high_t x8 = input[7];
583  tran_high_t x9 = input[8];
584  tran_high_t x10 = input[5];
585  tran_high_t x11 = input[10];
586  tran_high_t x12 = input[3];
587  tran_high_t x13 = input[12];
588  tran_high_t x14 = input[1];
589  tran_high_t x15 = input[14];
590
591  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
592           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
593    output[0] = output[1] = output[2] = output[3] = output[4]
594              = output[5] = output[6] = output[7] = output[8]
595              = output[9] = output[10] = output[11] = output[12]
596              = output[13] = output[14] = output[15] = 0;
597    return;
598  }
599
600  // stage 1
601  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
602  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
603  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
604  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
605  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
606  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
607  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
608  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
609  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
610  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
611  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
612  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
613  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
614  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
615  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
616  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
617
618  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
619  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
620  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
621  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
622  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
623  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
624  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
625  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
626  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
627  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
628  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
629  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
630  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
631  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
632  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
633  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
634
635  // stage 2
636  s0 = x0;
637  s1 = x1;
638  s2 = x2;
639  s3 = x3;
640  s4 = x4;
641  s5 = x5;
642  s6 = x6;
643  s7 = x7;
644  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
645  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
646  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
647  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
648  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
649  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
650  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
651  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
652
653  x0 = WRAPLOW(s0 + s4, 8);
654  x1 = WRAPLOW(s1 + s5, 8);
655  x2 = WRAPLOW(s2 + s6, 8);
656  x3 = WRAPLOW(s3 + s7, 8);
657  x4 = WRAPLOW(s0 - s4, 8);
658  x5 = WRAPLOW(s1 - s5, 8);
659  x6 = WRAPLOW(s2 - s6, 8);
660  x7 = WRAPLOW(s3 - s7, 8);
661  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
662  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
663  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
664  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
665  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
666  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
667  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
668  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
669
670  // stage 3
671  s0 = x0;
672  s1 = x1;
673  s2 = x2;
674  s3 = x3;
675  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
676  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
677  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
678  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
679  s8 = x8;
680  s9 = x9;
681  s10 = x10;
682  s11 = x11;
683  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
684  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
685  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
686  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
687
688  x0 = WRAPLOW(check_range(s0 + s2), 8);
689  x1 = WRAPLOW(check_range(s1 + s3), 8);
690  x2 = WRAPLOW(check_range(s0 - s2), 8);
691  x3 = WRAPLOW(check_range(s1 - s3), 8);
692  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
693  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
694  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
695  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
696  x8 = WRAPLOW(check_range(s8 + s10), 8);
697  x9 = WRAPLOW(check_range(s9 + s11), 8);
698  x10 = WRAPLOW(check_range(s8 - s10), 8);
699  x11 = WRAPLOW(check_range(s9 - s11), 8);
700  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
701  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
702  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
703  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
704
705  // stage 4
706  s2 = (- cospi_16_64) * (x2 + x3);
707  s3 = cospi_16_64 * (x2 - x3);
708  s6 = cospi_16_64 * (x6 + x7);
709  s7 = cospi_16_64 * (- x6 + x7);
710  s10 = cospi_16_64 * (x10 + x11);
711  s11 = cospi_16_64 * (- x10 + x11);
712  s14 = (- cospi_16_64) * (x14 + x15);
713  s15 = cospi_16_64 * (x14 - x15);
714
715  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
716  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
717  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
718  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
719  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
720  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
721  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
722  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
723
724  output[0] = WRAPLOW(x0, 8);
725  output[1] = WRAPLOW(-x8, 8);
726  output[2] = WRAPLOW(x12, 8);
727  output[3] = WRAPLOW(-x4, 8);
728  output[4] = WRAPLOW(x6, 8);
729  output[5] = WRAPLOW(x14, 8);
730  output[6] = WRAPLOW(x10, 8);
731  output[7] = WRAPLOW(x2, 8);
732  output[8] = WRAPLOW(x3, 8);
733  output[9] = WRAPLOW(x11, 8);
734  output[10] = WRAPLOW(x15, 8);
735  output[11] = WRAPLOW(x7, 8);
736  output[12] = WRAPLOW(x5, 8);
737  output[13] = WRAPLOW(-x13, 8);
738  output[14] = WRAPLOW(x9, 8);
739  output[15] = WRAPLOW(-x1, 8);
740}
741
742void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
743                            int stride) {
744  tran_low_t out[16 * 16] = { 0 };
745  tran_low_t *outptr = out;
746  int i, j;
747  tran_low_t temp_in[16], temp_out[16];
748
749  // First transform rows. Since all non-zero dct coefficients are in
750  // upper-left 4x4 area, we only need to calculate first 4 rows here.
751  for (i = 0; i < 4; ++i) {
752    idct16_c(input, outptr);
753    input += 16;
754    outptr += 16;
755  }
756
757  // Then transform columns
758  for (i = 0; i < 16; ++i) {
759    for (j = 0; j < 16; ++j)
760      temp_in[j] = out[j*16 + i];
761    idct16_c(temp_in, temp_out);
762    for (j = 0; j < 16; ++j) {
763      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
764                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
765    }
766  }
767}
768
769void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
770  int i, j;
771  tran_high_t a1;
772  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
773  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
774  a1 = ROUND_POWER_OF_TWO(out, 6);
775  for (j = 0; j < 16; ++j) {
776    for (i = 0; i < 16; ++i)
777      dest[i] = clip_pixel_add(dest[i], a1);
778    dest += stride;
779  }
780}
781
782void idct32_c(const tran_low_t *input, tran_low_t *output) {
783  tran_low_t step1[32], step2[32];
784  tran_high_t temp1, temp2;
785
786  // stage 1
787  step1[0] = input[0];
788  step1[1] = input[16];
789  step1[2] = input[8];
790  step1[3] = input[24];
791  step1[4] = input[4];
792  step1[5] = input[20];
793  step1[6] = input[12];
794  step1[7] = input[28];
795  step1[8] = input[2];
796  step1[9] = input[18];
797  step1[10] = input[10];
798  step1[11] = input[26];
799  step1[12] = input[6];
800  step1[13] = input[22];
801  step1[14] = input[14];
802  step1[15] = input[30];
803
804  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
805  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
806  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
807  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
808
809  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
810  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
811  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
812  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
813
814  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
815  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
816  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
817  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
818
819  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
820  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
821  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
822  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
823
824  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
825  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
826  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
827  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
828
829  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
830  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
831  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
832  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
833
834  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
835  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
836  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
837  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
838
839  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
840  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
841  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
842  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
843
844  // stage 2
845  step2[0] = step1[0];
846  step2[1] = step1[1];
847  step2[2] = step1[2];
848  step2[3] = step1[3];
849  step2[4] = step1[4];
850  step2[5] = step1[5];
851  step2[6] = step1[6];
852  step2[7] = step1[7];
853
854  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
855  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
856  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
857  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
858
859  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
860  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
861  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
862  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
863
864  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
865  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
866  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
867  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
868
869  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
870  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
871  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
872  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
873
874  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
875  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
876  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
877  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
878  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
879  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
880  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
881  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
882  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
883  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
884  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
885  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
886  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
887  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
888  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
889  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
890
891  // stage 3
892  step1[0] = step2[0];
893  step1[1] = step2[1];
894  step1[2] = step2[2];
895  step1[3] = step2[3];
896
897  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
898  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
899  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
900  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
901  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
902  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
903  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
904  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
905
906  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
907  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
908  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
909  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
910  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
911  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
912  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
913  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
914
915  step1[16] = step2[16];
916  step1[31] = step2[31];
917  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
918  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
919  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
920  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
921  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
922  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
923  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
924  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
925  step1[19] = step2[19];
926  step1[20] = step2[20];
927  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
928  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
929  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
930  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
931  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
932  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
933  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
934  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
935  step1[23] = step2[23];
936  step1[24] = step2[24];
937  step1[27] = step2[27];
938  step1[28] = step2[28];
939
940  // stage 4
941  temp1 = (step1[0] + step1[1]) * cospi_16_64;
942  temp2 = (step1[0] - step1[1]) * cospi_16_64;
943  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
944  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
945  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
946  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
947  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
948  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
949  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
950  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
951  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
952  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
953
954  step2[8] = step1[8];
955  step2[15] = step1[15];
956  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
957  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
958  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
959  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
960  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
961  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
962  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
963  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
964  step2[11] = step1[11];
965  step2[12] = step1[12];
966
967  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
968  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
969  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
970  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
971  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
972  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
973  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
974  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
975
976  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
977  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
978  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
979  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
980  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
981  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
982  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
983  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
984
985  // stage 5
986  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
987  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
988  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
989  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
990  step1[4] = step2[4];
991  temp1 = (step2[6] - step2[5]) * cospi_16_64;
992  temp2 = (step2[5] + step2[6]) * cospi_16_64;
993  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
994  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
995  step1[7] = step2[7];
996
997  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
998  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
999  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
1000  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
1001  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
1002  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
1003  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
1004  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
1005
1006  step1[16] = step2[16];
1007  step1[17] = step2[17];
1008  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1009  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1010  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1011  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1012  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1013  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1014  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
1015  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
1016  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1017  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1018  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1019  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1020  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1021  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1022  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1023  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1024  step1[22] = step2[22];
1025  step1[23] = step2[23];
1026  step1[24] = step2[24];
1027  step1[25] = step2[25];
1028  step1[30] = step2[30];
1029  step1[31] = step2[31];
1030
1031  // stage 6
1032  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
1033  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
1034  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
1035  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
1036  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
1037  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
1038  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
1039  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
1040  step2[8] = step1[8];
1041  step2[9] = step1[9];
1042  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1043  temp2 = (step1[10] + step1[13]) * cospi_16_64;
1044  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1045  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1046  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1047  temp2 = (step1[11] + step1[12]) * cospi_16_64;
1048  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1049  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1050  step2[14] = step1[14];
1051  step2[15] = step1[15];
1052
1053  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
1054  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
1055  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
1056  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
1057  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
1058  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
1059  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
1060  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
1061
1062  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
1063  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
1064  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
1065  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
1066  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
1067  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
1068  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
1069  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
1070
1071  // stage 7
1072  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
1073  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
1074  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
1075  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
1076  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
1077  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
1078  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
1079  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
1080  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
1081  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
1082  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
1083  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
1084  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
1085  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
1086  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
1087  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
1088
1089  step1[16] = step2[16];
1090  step1[17] = step2[17];
1091  step1[18] = step2[18];
1092  step1[19] = step2[19];
1093  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1094  temp2 = (step2[20] + step2[27]) * cospi_16_64;
1095  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1096  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1097  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1098  temp2 = (step2[21] + step2[26]) * cospi_16_64;
1099  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1100  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1101  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1102  temp2 = (step2[22] + step2[25]) * cospi_16_64;
1103  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1104  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1105  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1106  temp2 = (step2[23] + step2[24]) * cospi_16_64;
1107  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
1108  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
1109  step1[28] = step2[28];
1110  step1[29] = step2[29];
1111  step1[30] = step2[30];
1112  step1[31] = step2[31];
1113
1114  // final stage
1115  output[0] = WRAPLOW(step1[0] + step1[31], 8);
1116  output[1] = WRAPLOW(step1[1] + step1[30], 8);
1117  output[2] = WRAPLOW(step1[2] + step1[29], 8);
1118  output[3] = WRAPLOW(step1[3] + step1[28], 8);
1119  output[4] = WRAPLOW(step1[4] + step1[27], 8);
1120  output[5] = WRAPLOW(step1[5] + step1[26], 8);
1121  output[6] = WRAPLOW(step1[6] + step1[25], 8);
1122  output[7] = WRAPLOW(step1[7] + step1[24], 8);
1123  output[8] = WRAPLOW(step1[8] + step1[23], 8);
1124  output[9] = WRAPLOW(step1[9] + step1[22], 8);
1125  output[10] = WRAPLOW(step1[10] + step1[21], 8);
1126  output[11] = WRAPLOW(step1[11] + step1[20], 8);
1127  output[12] = WRAPLOW(step1[12] + step1[19], 8);
1128  output[13] = WRAPLOW(step1[13] + step1[18], 8);
1129  output[14] = WRAPLOW(step1[14] + step1[17], 8);
1130  output[15] = WRAPLOW(step1[15] + step1[16], 8);
1131  output[16] = WRAPLOW(step1[15] - step1[16], 8);
1132  output[17] = WRAPLOW(step1[14] - step1[17], 8);
1133  output[18] = WRAPLOW(step1[13] - step1[18], 8);
1134  output[19] = WRAPLOW(step1[12] - step1[19], 8);
1135  output[20] = WRAPLOW(step1[11] - step1[20], 8);
1136  output[21] = WRAPLOW(step1[10] - step1[21], 8);
1137  output[22] = WRAPLOW(step1[9] - step1[22], 8);
1138  output[23] = WRAPLOW(step1[8] - step1[23], 8);
1139  output[24] = WRAPLOW(step1[7] - step1[24], 8);
1140  output[25] = WRAPLOW(step1[6] - step1[25], 8);
1141  output[26] = WRAPLOW(step1[5] - step1[26], 8);
1142  output[27] = WRAPLOW(step1[4] - step1[27], 8);
1143  output[28] = WRAPLOW(step1[3] - step1[28], 8);
1144  output[29] = WRAPLOW(step1[2] - step1[29], 8);
1145  output[30] = WRAPLOW(step1[1] - step1[30], 8);
1146  output[31] = WRAPLOW(step1[0] - step1[31], 8);
1147}
1148
1149void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1150                              int stride) {
1151  tran_low_t out[32 * 32];
1152  tran_low_t *outptr = out;
1153  int i, j;
1154  tran_low_t temp_in[32], temp_out[32];
1155
1156  // Rows
1157  for (i = 0; i < 32; ++i) {
1158    int16_t zero_coeff[16];
1159    for (j = 0; j < 16; ++j)
1160      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1161    for (j = 0; j < 8; ++j)
1162      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1163    for (j = 0; j < 4; ++j)
1164      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1165    for (j = 0; j < 2; ++j)
1166      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1167
1168    if (zero_coeff[0] | zero_coeff[1])
1169      idct32_c(input, outptr);
1170    else
1171      memset(outptr, 0, sizeof(tran_low_t) * 32);
1172    input += 32;
1173    outptr += 32;
1174  }
1175
1176  // Columns
1177  for (i = 0; i < 32; ++i) {
1178    for (j = 0; j < 32; ++j)
1179      temp_in[j] = out[j * 32 + i];
1180    idct32_c(temp_in, temp_out);
1181    for (j = 0; j < 32; ++j) {
1182      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1183                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
1184    }
1185  }
1186}
1187
1188void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1189                            int stride) {
1190  tran_low_t out[32 * 32] = {0};
1191  tran_low_t *outptr = out;
1192  int i, j;
1193  tran_low_t temp_in[32], temp_out[32];
1194
1195  // Rows
1196  // only upper-left 8x8 has non-zero coeff
1197  for (i = 0; i < 8; ++i) {
1198    idct32_c(input, outptr);
1199    input += 32;
1200    outptr += 32;
1201  }
1202
1203  // Columns
1204  for (i = 0; i < 32; ++i) {
1205    for (j = 0; j < 32; ++j)
1206      temp_in[j] = out[j * 32 + i];
1207    idct32_c(temp_in, temp_out);
1208    for (j = 0; j < 32; ++j) {
1209      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1210                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
1211    }
1212  }
1213}
1214
1215void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1216  int i, j;
1217  tran_high_t a1;
1218
1219  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
1220  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
1221  a1 = ROUND_POWER_OF_TWO(out, 6);
1222
1223  for (j = 0; j < 32; ++j) {
1224    for (i = 0; i < 32; ++i)
1225      dest[i] = clip_pixel_add(dest[i], a1);
1226    dest += stride;
1227  }
1228}
1229
1230#if CONFIG_VP9_HIGHBITDEPTH
1231void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1232                                 int stride, int bd) {
1233  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1234     0.5 shifts per pixel. */
1235  int i;
1236  tran_low_t output[16];
1237  tran_high_t a1, b1, c1, d1, e1;
1238  const tran_low_t *ip = input;
1239  tran_low_t *op = output;
1240  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1241
1242  for (i = 0; i < 4; i++) {
1243    a1 = ip[0] >> UNIT_QUANT_SHIFT;
1244    c1 = ip[1] >> UNIT_QUANT_SHIFT;
1245    d1 = ip[2] >> UNIT_QUANT_SHIFT;
1246    b1 = ip[3] >> UNIT_QUANT_SHIFT;
1247    a1 += c1;
1248    d1 -= b1;
1249    e1 = (a1 - d1) >> 1;
1250    b1 = e1 - b1;
1251    c1 = e1 - c1;
1252    a1 -= b1;
1253    d1 += c1;
1254    op[0] = WRAPLOW(a1, bd);
1255    op[1] = WRAPLOW(b1, bd);
1256    op[2] = WRAPLOW(c1, bd);
1257    op[3] = WRAPLOW(d1, bd);
1258    ip += 4;
1259    op += 4;
1260  }
1261
1262  ip = output;
1263  for (i = 0; i < 4; i++) {
1264    a1 = ip[4 * 0];
1265    c1 = ip[4 * 1];
1266    d1 = ip[4 * 2];
1267    b1 = ip[4 * 3];
1268    a1 += c1;
1269    d1 -= b1;
1270    e1 = (a1 - d1) >> 1;
1271    b1 = e1 - b1;
1272    c1 = e1 - c1;
1273    a1 -= b1;
1274    d1 += c1;
1275    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1276    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
1277    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
1278    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
1279
1280    ip++;
1281    dest++;
1282  }
1283}
1284
1285void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1286                                int dest_stride, int bd) {
1287  int i;
1288  tran_high_t a1, e1;
1289  tran_low_t tmp[4];
1290  const tran_low_t *ip = in;
1291  tran_low_t *op = tmp;
1292  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1293  (void) bd;
1294
1295  a1 = ip[0] >> UNIT_QUANT_SHIFT;
1296  e1 = a1 >> 1;
1297  a1 -= e1;
1298  op[0] = WRAPLOW(a1, bd);
1299  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
1300
1301  ip = tmp;
1302  for (i = 0; i < 4; i++) {
1303    e1 = ip[0] >> 1;
1304    a1 = ip[0] - e1;
1305    dest[dest_stride * 0] = highbd_clip_pixel_add(
1306        dest[dest_stride * 0], a1, bd);
1307    dest[dest_stride * 1] = highbd_clip_pixel_add(
1308        dest[dest_stride * 1], e1, bd);
1309    dest[dest_stride * 2] = highbd_clip_pixel_add(
1310        dest[dest_stride * 2], e1, bd);
1311    dest[dest_stride * 3] = highbd_clip_pixel_add(
1312        dest[dest_stride * 3], e1, bd);
1313    ip++;
1314    dest++;
1315  }
1316}
1317
1318void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1319  tran_low_t step[4];
1320  tran_high_t temp1, temp2;
1321  (void) bd;
1322  // stage 1
1323  temp1 = (input[0] + input[2]) * cospi_16_64;
1324  temp2 = (input[0] - input[2]) * cospi_16_64;
1325  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1326  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1327  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1328  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1329  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1330  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1331
1332  // stage 2
1333  output[0] = WRAPLOW(step[0] + step[3], bd);
1334  output[1] = WRAPLOW(step[1] + step[2], bd);
1335  output[2] = WRAPLOW(step[1] - step[2], bd);
1336  output[3] = WRAPLOW(step[0] - step[3], bd);
1337}
1338
1339void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1340                                 int stride, int bd) {
1341  tran_low_t out[4 * 4];
1342  tran_low_t *outptr = out;
1343  int i, j;
1344  tran_low_t temp_in[4], temp_out[4];
1345  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1346
1347  // Rows
1348  for (i = 0; i < 4; ++i) {
1349    vpx_highbd_idct4_c(input, outptr, bd);
1350    input += 4;
1351    outptr += 4;
1352  }
1353
1354  // Columns
1355  for (i = 0; i < 4; ++i) {
1356    for (j = 0; j < 4; ++j)
1357      temp_in[j] = out[j * 4 + i];
1358    vpx_highbd_idct4_c(temp_in, temp_out, bd);
1359    for (j = 0; j < 4; ++j) {
1360      dest[j * stride + i] = highbd_clip_pixel_add(
1361          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1362    }
1363  }
1364}
1365
1366void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1367                                int dest_stride, int bd) {
1368  int i;
1369  tran_high_t a1;
1370  tran_low_t out = WRAPLOW(
1371      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
1372  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1373
1374  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
1375  a1 = ROUND_POWER_OF_TWO(out, 4);
1376
1377  for (i = 0; i < 4; i++) {
1378    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1379    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1380    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1381    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1382    dest += dest_stride;
1383  }
1384}
1385
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 8-point inverse DCT (high bit depth).  Even inputs (0,2,4,6) feed an
  // embedded 4-point IDCT; odd inputs (1,3,5,7) feed the odd-half
  // butterfly network; a final butterfly merges the two halves.
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  // Gather the even-indexed inputs into step1[0..3] in the order the
  // 4-point IDCT expects; odd inputs are rotated by cospi pairs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2 & stage 3 - even half
  // In-place 4-point IDCT: step1[0..3] are fully read before being
  // overwritten inside vpx_highbd_idct4_c.
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  // stage 4
  // Final butterfly: even half (step1[0..3]) plus/minus odd half
  // (step1[4..7]) in mirrored order.
  output[0] = WRAPLOW(step1[0] + step1[7], bd);
  output[1] = WRAPLOW(step1[1] + step1[6], bd);
  output[2] = WRAPLOW(step1[2] + step1[5], bd);
  output[3] = WRAPLOW(step1[3] + step1[4], bd);
  output[4] = WRAPLOW(step1[3] - step1[4], bd);
  output[5] = WRAPLOW(step1[2] - step1[5], bd);
  output[6] = WRAPLOW(step1[1] - step1[6], bd);
  output[7] = WRAPLOW(step1[0] - step1[7], bd);
}
1430
1431void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1432                                 int stride, int bd) {
1433  tran_low_t out[8 * 8];
1434  tran_low_t *outptr = out;
1435  int i, j;
1436  tran_low_t temp_in[8], temp_out[8];
1437  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1438
1439  // First transform rows.
1440  for (i = 0; i < 8; ++i) {
1441    vpx_highbd_idct8_c(input, outptr, bd);
1442    input += 8;
1443    outptr += 8;
1444  }
1445
1446  // Then transform columns.
1447  for (i = 0; i < 8; ++i) {
1448    for (j = 0; j < 8; ++j)
1449      temp_in[j] = out[j * 8 + i];
1450    vpx_highbd_idct8_c(temp_in, temp_out, bd);
1451    for (j = 0; j < 8; ++j) {
1452      dest[j * stride + i] = highbd_clip_pixel_add(
1453          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1454    }
1455  }
1456}
1457
1458void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1459                                int stride, int bd) {
1460  int i, j;
1461  tran_high_t a1;
1462  tran_low_t out = WRAPLOW(
1463      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
1464  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1465  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
1466  a1 = ROUND_POWER_OF_TWO(out, 5);
1467  for (j = 0; j < 8; ++j) {
1468    for (i = 0; i < 8; ++i)
1469      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1470    dest += stride;
1471  }
1472}
1473
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 4-point inverse ADST (asymmetric discrete sine transform), high bit
  // depth variant.  Reads 4 coefficients, writes 4 outputs.
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  // NOTE(review): bd is referenced below via the WRAPLOW/round-shift
  // macros; presumably this cast silences unused-parameter warnings in
  // configurations where those macros ignore bd — confirm.
  (void) bd;

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Products of the inputs with the sine-derived transform constants.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)(x0 - x2 + x3);

  // Combine partial sums; note the deliberate reuse: s3 takes the old s2
  // before s2 is overwritten with the sinpi_3_9 * s7 term.
  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}
1511
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 8-point inverse ADST (high bit depth): three butterfly/rotation
  // stages over a permuted input ordering, with sign flips applied in
  // the final write-out.
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Inputs are consumed in the ADST's permuted order, not sequentially.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  // NOTE(review): bd is referenced below via the WRAPLOW/round-shift
  // macros; presumably this cast silences unused-parameter warnings in
  // configurations where those macros ignore bd — confirm.
  (void) bd;

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  // Pairwise rotations by (cospi_k, cospi_{32-k}) constants.
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);

  // stage 2
  // Upper half passes through; lower half is rotated by cospi_8/24.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);

  // stage 3
  // Final cospi_16 rotations on the middle pairs.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);

  // Write out in the ADST's output permutation with alternating signs.
  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x4, bd);
  output[2] = WRAPLOW(x6, bd);
  output[3] = WRAPLOW(-x2, bd);
  output[4] = WRAPLOW(x3, bd);
  output[5] = WRAPLOW(-x7, bd);
  output[6] = WRAPLOW(x5, bd);
  output[7] = WRAPLOW(-x1, bd);
}
1588
1589void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
1590                                 int stride, int bd) {
1591  tran_low_t out[8 * 8] = { 0 };
1592  tran_low_t *outptr = out;
1593  int i, j;
1594  tran_low_t temp_in[8], temp_out[8];
1595  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1596
1597  // First transform rows.
1598  // Only first 4 row has non-zero coefs.
1599  for (i = 0; i < 4; ++i) {
1600    vpx_highbd_idct8_c(input, outptr, bd);
1601    input += 8;
1602    outptr += 8;
1603  }
1604  // Then transform columns.
1605  for (i = 0; i < 8; ++i) {
1606    for (j = 0; j < 8; ++j)
1607      temp_in[j] = out[j * 8 + i];
1608    vpx_highbd_idct8_c(temp_in, temp_out, bd);
1609    for (j = 0; j < 8; ++j) {
1610      dest[j * stride + i] = highbd_clip_pixel_add(
1611          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1612    }
1613  }
1614}
1615
1616void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1617  tran_low_t step1[16], step2[16];
1618  tran_high_t temp1, temp2;
1619  (void) bd;
1620
1621  // stage 1
1622  step1[0] = input[0/2];
1623  step1[1] = input[16/2];
1624  step1[2] = input[8/2];
1625  step1[3] = input[24/2];
1626  step1[4] = input[4/2];
1627  step1[5] = input[20/2];
1628  step1[6] = input[12/2];
1629  step1[7] = input[28/2];
1630  step1[8] = input[2/2];
1631  step1[9] = input[18/2];
1632  step1[10] = input[10/2];
1633  step1[11] = input[26/2];
1634  step1[12] = input[6/2];
1635  step1[13] = input[22/2];
1636  step1[14] = input[14/2];
1637  step1[15] = input[30/2];
1638
1639  // stage 2
1640  step2[0] = step1[0];
1641  step2[1] = step1[1];
1642  step2[2] = step1[2];
1643  step2[3] = step1[3];
1644  step2[4] = step1[4];
1645  step2[5] = step1[5];
1646  step2[6] = step1[6];
1647  step2[7] = step1[7];
1648
1649  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1650  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1651  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1652  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1653
1654  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1655  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1656  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1657  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1658
1659  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1660  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1661  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1662  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1663
1664  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1665  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1666  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1667  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1668
1669  // stage 3
1670  step1[0] = step2[0];
1671  step1[1] = step2[1];
1672  step1[2] = step2[2];
1673  step1[3] = step2[3];
1674
1675  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1676  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1677  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1678  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1679  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1680  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1681  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1682  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1683
1684  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
1685  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
1686  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
1687  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
1688  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
1689  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
1690  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
1691  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
1692
1693  // stage 4
1694  temp1 = (step1[0] + step1[1]) * cospi_16_64;
1695  temp2 = (step1[0] - step1[1]) * cospi_16_64;
1696  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1697  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1698  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1699  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1700  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1701  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1702  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
1703  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
1704  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
1705  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
1706
1707  step2[8] = step1[8];
1708  step2[15] = step1[15];
1709  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1710  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1711  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1712  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1713  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1714  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1715  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1716  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1717  step2[11] = step1[11];
1718  step2[12] = step1[12];
1719
1720  // stage 5
1721  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
1722  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
1723  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
1724  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
1725  step1[4] = step2[4];
1726  temp1 = (step2[6] - step2[5]) * cospi_16_64;
1727  temp2 = (step2[5] + step2[6]) * cospi_16_64;
1728  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1729  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1730  step1[7] = step2[7];
1731
1732  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
1733  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
1734  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
1735  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
1736  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
1737  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
1738  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
1739  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
1740
1741  // stage 6
1742  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
1743  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
1744  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
1745  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
1746  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
1747  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
1748  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
1749  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
1750  step2[8] = step1[8];
1751  step2[9] = step1[9];
1752  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1753  temp2 = (step1[10] + step1[13]) * cospi_16_64;
1754  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1755  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1756  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1757  temp2 = (step1[11] + step1[12]) * cospi_16_64;
1758  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1759  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1760  step2[14] = step1[14];
1761  step2[15] = step1[15];
1762
1763  // stage 7
1764  output[0] = WRAPLOW(step2[0] + step2[15], bd);
1765  output[1] = WRAPLOW(step2[1] + step2[14], bd);
1766  output[2] = WRAPLOW(step2[2] + step2[13], bd);
1767  output[3] = WRAPLOW(step2[3] + step2[12], bd);
1768  output[4] = WRAPLOW(step2[4] + step2[11], bd);
1769  output[5] = WRAPLOW(step2[5] + step2[10], bd);
1770  output[6] = WRAPLOW(step2[6] + step2[9], bd);
1771  output[7] = WRAPLOW(step2[7] + step2[8], bd);
1772  output[8] = WRAPLOW(step2[7] - step2[8], bd);
1773  output[9] = WRAPLOW(step2[6] - step2[9], bd);
1774  output[10] = WRAPLOW(step2[5] - step2[10], bd);
1775  output[11] = WRAPLOW(step2[4] - step2[11], bd);
1776  output[12] = WRAPLOW(step2[3] - step2[12], bd);
1777  output[13] = WRAPLOW(step2[2] - step2[13], bd);
1778  output[14] = WRAPLOW(step2[1] - step2[14], bd);
1779  output[15] = WRAPLOW(step2[0] - step2[15], bd);
1780}
1781
1782void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
1783                                    int stride, int bd) {
1784  tran_low_t out[16 * 16];
1785  tran_low_t *outptr = out;
1786  int i, j;
1787  tran_low_t temp_in[16], temp_out[16];
1788  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1789
1790  // First transform rows.
1791  for (i = 0; i < 16; ++i) {
1792    vpx_highbd_idct16_c(input, outptr, bd);
1793    input += 16;
1794    outptr += 16;
1795  }
1796
1797  // Then transform columns.
1798  for (i = 0; i < 16; ++i) {
1799    for (j = 0; j < 16; ++j)
1800      temp_in[j] = out[j * 16 + i];
1801    vpx_highbd_idct16_c(temp_in, temp_out, bd);
1802    for (j = 0; j < 16; ++j) {
1803      dest[j * stride + i] = highbd_clip_pixel_add(
1804          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
1805    }
1806  }
1807}
1808
// High-bitdepth 16-point inverse ADST (asymmetric discrete sine transform),
// one 1-D pass.  Reads 16 coefficients from `input` in the transform's
// interleaved order and writes 16 values to `output`.  Intermediate rounding
// and wrapping are delegated to highbd_dct_const_round_shift() and WRAPLOW().
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Load coefficients in the butterfly's interleaved input order.
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  // Silences unused-parameter warnings in configurations where the macros
  // below ignore bd.
  (void) bd;

  // Fast path: an all-zero input block produces an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotate input pairs by the odd cospi angles.
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  // Butterfly across the two halves, with round-shift back to tran_low_t.
  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);

  // stage 2: pass the first half through; rotate the second half.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Pass-through terms only wrap; rotated terms also need the round-shift.
  x0 = WRAPLOW(s0 + s4, bd);
  x1 = WRAPLOW(s1 + s5, bd);
  x2 = WRAPLOW(s2 + s6, bd);
  x3 = WRAPLOW(s3 + s7, bd);
  x4 = WRAPLOW(s0 - s4, bd);
  x5 = WRAPLOW(s1 - s5, bd);
  x6 = WRAPLOW(s2 - s6, bd);
  x7 = WRAPLOW(s3 - s7, bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);

  // stage 3: rotate quarters by +/- 8/24 cospi angles.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
  x8 = WRAPLOW(s8 + s10, bd);
  x9 = WRAPLOW(s9 + s11, bd);
  x10 = WRAPLOW(s8 - s10, bd);
  x11 = WRAPLOW(s9 - s11, bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);

  // stage 4: final 45-degree (cospi_16) rotations on selected pairs.
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);

  // Write results in output order; odd-position sign flips complete the
  // inverse ADST definition.
  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x8, bd);
  output[2] = WRAPLOW(x12, bd);
  output[3] = WRAPLOW(-x4, bd);
  output[4] = WRAPLOW(x6, bd);
  output[5] = WRAPLOW(x14, bd);
  output[6] = WRAPLOW(x10, bd);
  output[7] = WRAPLOW(x2, bd);
  output[8] = WRAPLOW(x3, bd);
  output[9] = WRAPLOW(x11, bd);
  output[10] = WRAPLOW(x15, bd);
  output[11] = WRAPLOW(x7, bd);
  output[12] = WRAPLOW(x5, bd);
  output[13] = WRAPLOW(-x13, bd);
  output[14] = WRAPLOW(x9, bd);
  output[15] = WRAPLOW(-x1, bd);
}
1978
1979void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
1980                                   int stride, int bd) {
1981  tran_low_t out[16 * 16] = { 0 };
1982  tran_low_t *outptr = out;
1983  int i, j;
1984  tran_low_t temp_in[16], temp_out[16];
1985  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1986
1987  // First transform rows. Since all non-zero dct coefficients are in
1988  // upper-left 4x4 area, we only need to calculate first 4 rows here.
1989  for (i = 0; i < 4; ++i) {
1990    vpx_highbd_idct16_c(input, outptr, bd);
1991    input += 16;
1992    outptr += 16;
1993  }
1994
1995  // Then transform columns.
1996  for (i = 0; i < 16; ++i) {
1997    for (j = 0; j < 16; ++j)
1998      temp_in[j] = out[j*16 + i];
1999    vpx_highbd_idct16_c(temp_in, temp_out, bd);
2000    for (j = 0; j < 16; ++j) {
2001      dest[j * stride + i] = highbd_clip_pixel_add(
2002          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2003    }
2004  }
2005}
2006
2007void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2008                                  int stride, int bd) {
2009  int i, j;
2010  tran_high_t a1;
2011  tran_low_t out = WRAPLOW(
2012      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
2013  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2014
2015  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
2016  a1 = ROUND_POWER_OF_TWO(out, 6);
2017  for (j = 0; j < 16; ++j) {
2018    for (i = 0; i < 16; ++i)
2019      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2020    dest += stride;
2021  }
2022}
2023
// High-bitdepth 32-point inverse DCT, one 1-D pass.  `input` holds 32
// coefficients; `output` receives the 32 transformed values.  The transform
// is implemented as the standard butterfly decomposition: reordering plus
// odd-coefficient rotations (stages 1-2), progressively wider butterflies
// (stages 3-6), and a final add/subtract mirror.  Intermediate values are
// rounded via highbd_dct_const_round_shift() and wrapped via WRAPLOW().
static void highbd_idct32_c(const tran_low_t *input,
                            tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  // Silences unused-parameter warnings in configurations where the macros
  // below ignore bd.
  (void) bd;

  // stage 1: bit-reverse-style reordering of the even inputs, and rotations
  // of the odd inputs by the odd cospi angles.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2: rotate indices 8-15; butterfly pairs within 16-31.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
  step2[31] = WRAPLOW(step1[30] + step1[31], bd);

  // stage 3: rotate indices 4-7; butterfly 8-15; rotate selected pairs in
  // 16-31 (the others pass through).
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4: rotate 0-3; butterfly 4-7; rotate selected pairs in 8-15;
  // 4-wide butterflies in 16-31.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
  step2[23] = WRAPLOW(step1[20] + step1[23], bd);

  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
  step2[31] = WRAPLOW(step1[28] + step1[31], bd);

  // stage 5: butterfly 0-3; 45-degree rotation of 5/6; 4-wide butterflies
  // in 8-15; rotate selected pairs in 16-31.
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6: 8-wide butterfly in 0-7; 45-degree rotations of 10/13 and
  // 11/12; 8-wide butterflies in 16-31.
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
  step2[23] = WRAPLOW(step1[16] - step1[23], bd);

  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
  step2[31] = WRAPLOW(step1[24] + step1[31], bd);

  // stage 7: 16-wide butterfly in 0-15; 45-degree rotations of 20-23
  // against 24-27.
  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
  step1[15] = WRAPLOW(step2[0] - step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage: 32-wide add/subtract mirror producing the output.
  output[0] = WRAPLOW(step1[0] + step1[31], bd);
  output[1] = WRAPLOW(step1[1] + step1[30], bd);
  output[2] = WRAPLOW(step1[2] + step1[29], bd);
  output[3] = WRAPLOW(step1[3] + step1[28], bd);
  output[4] = WRAPLOW(step1[4] + step1[27], bd);
  output[5] = WRAPLOW(step1[5] + step1[26], bd);
  output[6] = WRAPLOW(step1[6] + step1[25], bd);
  output[7] = WRAPLOW(step1[7] + step1[24], bd);
  output[8] = WRAPLOW(step1[8] + step1[23], bd);
  output[9] = WRAPLOW(step1[9] + step1[22], bd);
  output[10] = WRAPLOW(step1[10] + step1[21], bd);
  output[11] = WRAPLOW(step1[11] + step1[20], bd);
  output[12] = WRAPLOW(step1[12] + step1[19], bd);
  output[13] = WRAPLOW(step1[13] + step1[18], bd);
  output[14] = WRAPLOW(step1[14] + step1[17], bd);
  output[15] = WRAPLOW(step1[15] + step1[16], bd);
  output[16] = WRAPLOW(step1[15] - step1[16], bd);
  output[17] = WRAPLOW(step1[14] - step1[17], bd);
  output[18] = WRAPLOW(step1[13] - step1[18], bd);
  output[19] = WRAPLOW(step1[12] - step1[19], bd);
  output[20] = WRAPLOW(step1[11] - step1[20], bd);
  output[21] = WRAPLOW(step1[10] - step1[21], bd);
  output[22] = WRAPLOW(step1[9] - step1[22], bd);
  output[23] = WRAPLOW(step1[8] - step1[23], bd);
  output[24] = WRAPLOW(step1[7] - step1[24], bd);
  output[25] = WRAPLOW(step1[6] - step1[25], bd);
  output[26] = WRAPLOW(step1[5] - step1[26], bd);
  output[27] = WRAPLOW(step1[4] - step1[27], bd);
  output[28] = WRAPLOW(step1[3] - step1[28], bd);
  output[29] = WRAPLOW(step1[2] - step1[29], bd);
  output[30] = WRAPLOW(step1[1] - step1[30], bd);
  output[31] = WRAPLOW(step1[0] - step1[31], bd);
}
2392
2393void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2394                                     int stride, int bd) {
2395  tran_low_t out[32 * 32];
2396  tran_low_t *outptr = out;
2397  int i, j;
2398  tran_low_t temp_in[32], temp_out[32];
2399  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2400
2401  // Rows
2402  for (i = 0; i < 32; ++i) {
2403    tran_low_t zero_coeff[16];
2404    for (j = 0; j < 16; ++j)
2405      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2406    for (j = 0; j < 8; ++j)
2407      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2408    for (j = 0; j < 4; ++j)
2409      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2410    for (j = 0; j < 2; ++j)
2411      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2412
2413    if (zero_coeff[0] | zero_coeff[1])
2414      highbd_idct32_c(input, outptr, bd);
2415    else
2416      memset(outptr, 0, sizeof(tran_low_t) * 32);
2417    input += 32;
2418    outptr += 32;
2419  }
2420
2421  // Columns
2422  for (i = 0; i < 32; ++i) {
2423    for (j = 0; j < 32; ++j)
2424      temp_in[j] = out[j * 32 + i];
2425    highbd_idct32_c(temp_in, temp_out, bd);
2426    for (j = 0; j < 32; ++j) {
2427      dest[j * stride + i] = highbd_clip_pixel_add(
2428          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2429    }
2430  }
2431}
2432
2433void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2434                                   int stride, int bd) {
2435  tran_low_t out[32 * 32] = {0};
2436  tran_low_t *outptr = out;
2437  int i, j;
2438  tran_low_t temp_in[32], temp_out[32];
2439  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2440
2441  // Rows
2442  // Only upper-left 8x8 has non-zero coeff.
2443  for (i = 0; i < 8; ++i) {
2444    highbd_idct32_c(input, outptr, bd);
2445    input += 32;
2446    outptr += 32;
2447  }
2448  // Columns
2449  for (i = 0; i < 32; ++i) {
2450    for (j = 0; j < 32; ++j)
2451      temp_in[j] = out[j * 32 + i];
2452    highbd_idct32_c(temp_in, temp_out, bd);
2453    for (j = 0; j < 32; ++j) {
2454      dest[j * stride + i] = highbd_clip_pixel_add(
2455          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2456    }
2457  }
2458}
2459
2460void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2461                                  int stride, int bd) {
2462  int i, j;
2463  int a1;
2464  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2465
2466  tran_low_t out = WRAPLOW(
2467      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
2468  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
2469  a1 = ROUND_POWER_OF_TWO(out, 6);
2470
2471  for (j = 0; j < 32; ++j) {
2472    for (i = 0; i < 32; ++i)
2473      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2474    dest += stride;
2475  }
2476}
2477#endif  // CONFIG_VP9_HIGHBITDEPTH
2478