1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <math.h>
13
14#include "./vp9_rtcd.h"
15#include "./vpx_config.h"
16#include "./vpx_dsp_rtcd.h"
17
18#include "vp9/common/vp9_blockd.h"
19#include "vp9/common/vp9_idct.h"
20#include "vpx_dsp/fwd_txfm.h"
21#include "vpx_ports/mem.h"
22
23static void fdct4(const tran_low_t *input, tran_low_t *output) {
24  tran_high_t step[4];
25  tran_high_t temp1, temp2;
26
27  step[0] = input[0] + input[3];
28  step[1] = input[1] + input[2];
29  step[2] = input[1] - input[2];
30  step[3] = input[0] - input[3];
31
32  temp1 = (step[0] + step[1]) * cospi_16_64;
33  temp2 = (step[0] - step[1]) * cospi_16_64;
34  output[0] = (tran_low_t)fdct_round_shift(temp1);
35  output[2] = (tran_low_t)fdct_round_shift(temp2);
36  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
37  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
38  output[1] = (tran_low_t)fdct_round_shift(temp1);
39  output[3] = (tran_low_t)fdct_round_shift(temp2);
40}
41
42static void fdct8(const tran_low_t *input, tran_low_t *output) {
43  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
44  tran_high_t t0, t1, t2, t3;                  // needs32
45  tran_high_t x0, x1, x2, x3;                  // canbe16
46
47  // stage 1
48  s0 = input[0] + input[7];
49  s1 = input[1] + input[6];
50  s2 = input[2] + input[5];
51  s3 = input[3] + input[4];
52  s4 = input[3] - input[4];
53  s5 = input[2] - input[5];
54  s6 = input[1] - input[6];
55  s7 = input[0] - input[7];
56
57  // fdct4(step, step);
58  x0 = s0 + s3;
59  x1 = s1 + s2;
60  x2 = s1 - s2;
61  x3 = s0 - s3;
62  t0 = (x0 + x1) * cospi_16_64;
63  t1 = (x0 - x1) * cospi_16_64;
64  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
65  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
66  output[0] = (tran_low_t)fdct_round_shift(t0);
67  output[2] = (tran_low_t)fdct_round_shift(t2);
68  output[4] = (tran_low_t)fdct_round_shift(t1);
69  output[6] = (tran_low_t)fdct_round_shift(t3);
70
71  // Stage 2
72  t0 = (s6 - s5) * cospi_16_64;
73  t1 = (s6 + s5) * cospi_16_64;
74  t2 = (tran_low_t)fdct_round_shift(t0);
75  t3 = (tran_low_t)fdct_round_shift(t1);
76
77  // Stage 3
78  x0 = s4 + t2;
79  x1 = s4 - t2;
80  x2 = s7 - t3;
81  x3 = s7 + t3;
82
83  // Stage 4
84  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
85  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
86  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
87  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
88  output[1] = (tran_low_t)fdct_round_shift(t0);
89  output[3] = (tran_low_t)fdct_round_shift(t2);
90  output[5] = (tran_low_t)fdct_round_shift(t1);
91  output[7] = (tran_low_t)fdct_round_shift(t3);
92}
93
// 16-point forward DCT: reads in[0..15], writes out[0..15].
//
// The sixteen inputs are folded about the centre. The eight sums go through
// an inlined 8-point DCT and produce the even-indexed outputs; the eight
// differences go through the step 2..6 butterfly network and produce the
// odd-indexed outputs. Every element of `in` is read (into input[] and
// step1[]) before the first store to `out`.
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
  tran_high_t step1[8];      // canbe16
  tran_high_t step2[8];      // canbe16
  tran_high_t step3[8];      // canbe16
  tran_high_t input[8];      // canbe16
  tran_high_t temp1, temp2;  // needs32

  // step 1
  // Mirrored sums: in[k] + in[15 - k]. These feed the even-output half.
  input[0] = in[0] + in[15];
  input[1] = in[1] + in[14];
  input[2] = in[2] + in[13];
  input[3] = in[3] + in[12];
  input[4] = in[4] + in[11];
  input[5] = in[5] + in[10];
  input[6] = in[6] + in[ 9];
  input[7] = in[7] + in[ 8];

  // Mirrored differences, stored innermost pair first
  // (step1[0] = in[7] - in[8] ... step1[7] = in[0] - in[15]).
  step1[0] = in[7] - in[ 8];
  step1[1] = in[6] - in[ 9];
  step1[2] = in[5] - in[10];
  step1[3] = in[4] - in[11];
  step1[4] = in[3] - in[12];
  step1[5] = in[2] - in[13];
  step1[6] = in[1] - in[14];
  step1[7] = in[0] - in[15];

  // fdct8(step, step);
  // Inlined 8-point DCT over input[]; results land on the even indices of out.
  {
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16

    // stage 1
    s0 = input[0] + input[7];
    s1 = input[1] + input[6];
    s2 = input[2] + input[5];
    s3 = input[3] + input[4];
    s4 = input[3] - input[4];
    s5 = input[2] - input[5];
    s6 = input[1] - input[6];
    s7 = input[0] - input[7];

    // fdct4(step, step);
    // 4-point butterfly over the sums -> out[0], out[4], out[8], out[12].
    x0 = s0 + s3;
    x1 = s1 + s2;
    x2 = s1 - s2;
    x3 = s0 - s3;
    t0 = (x0 + x1) * cospi_16_64;
    t1 = (x0 - x1) * cospi_16_64;
    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    out[0] = (tran_low_t)fdct_round_shift(t0);
    out[4] = (tran_low_t)fdct_round_shift(t2);
    out[8] = (tran_low_t)fdct_round_shift(t1);
    out[12] = (tran_low_t)fdct_round_shift(t3);

    // Stage 2
    // NOTE(review): unlike the standalone fdct8(), the rounded t2/t3 are not
    // narrowed to tran_low_t here -- confirm the difference is intentional.
    t0 = (s6 - s5) * cospi_16_64;
    t1 = (s6 + s5) * cospi_16_64;
    t2 = fdct_round_shift(t0);
    t3 = fdct_round_shift(t1);

    // Stage 3
    x0 = s4 + t2;
    x1 = s4 - t2;
    x2 = s7 - t3;
    x3 = s7 + t3;

    // Stage 4
    // Final rotations -> out[2], out[6], out[10], out[14].
    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    out[2] = (tran_low_t)fdct_round_shift(t0);
    out[6] = (tran_low_t)fdct_round_shift(t2);
    out[10] = (tran_low_t)fdct_round_shift(t1);
    out[14] = (tran_low_t)fdct_round_shift(t3);
  }

  // step 2
  // Combine the four middle differences (step1[2..5]) with cospi_16_64;
  // step1[0], step1[1], step1[6] and step1[7] are used unchanged in step 3.
  temp1 = (step1[5] - step1[2]) * cospi_16_64;
  temp2 = (step1[4] - step1[3]) * cospi_16_64;
  step2[2] = fdct_round_shift(temp1);
  step2[3] = fdct_round_shift(temp2);
  temp1 = (step1[4] + step1[3]) * cospi_16_64;
  temp2 = (step1[5] + step1[2]) * cospi_16_64;
  step2[4] = fdct_round_shift(temp1);
  step2[5] = fdct_round_shift(temp2);

  // step 3
  // Butterflies between the untouched outer terms and the step-2 results.
  step3[0] = step1[0] + step2[3];
  step3[1] = step1[1] + step2[2];
  step3[2] = step1[1] - step2[2];
  step3[3] = step1[0] - step2[3];
  step3[4] = step1[7] - step2[4];
  step3[5] = step1[6] - step2[5];
  step3[6] = step1[6] + step2[5];
  step3[7] = step1[7] + step2[4];

  // step 4
  // Rotate the pairs (step3[1], step3[6]) and (step3[2], step3[5]) by the
  // cospi_8_64 / cospi_24_64 constants. step2[] is reused as scratch: only
  // indices 1, 2, 5 and 6 are rewritten and read in step 5.
  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
  step2[1] = fdct_round_shift(temp1);
  step2[2] = fdct_round_shift(temp2);
  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
  step2[5] = fdct_round_shift(temp1);
  step2[6] = fdct_round_shift(temp2);

  // step 5
  // Butterflies; step1[] is reused to hold the eight inputs to step 6.
  step1[0] = step3[0] + step2[1];
  step1[1] = step3[0] - step2[1];
  step1[2] = step3[3] + step2[2];
  step1[3] = step3[3] - step2[2];
  step1[4] = step3[4] - step2[5];
  step1[5] = step3[4] + step2[5];
  step1[6] = step3[7] - step2[6];
  step1[7] = step3[7] + step2[6];

  // step 6
  // Final rotations of the pairs (0,7), (1,6), (2,5), (3,4) of step1[],
  // written to the odd indices of out.
  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
  out[1] = (tran_low_t)fdct_round_shift(temp1);
  out[9] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
  out[5] = (tran_low_t)fdct_round_shift(temp1);
  out[13] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
  out[3] = (tran_low_t)fdct_round_shift(temp1);
  out[11] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
  out[7] = (tran_low_t)fdct_round_shift(temp1);
  out[15] = (tran_low_t)fdct_round_shift(temp2);
}
234
235static void fadst4(const tran_low_t *input, tran_low_t *output) {
236  tran_high_t x0, x1, x2, x3;
237  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
238
239  x0 = input[0];
240  x1 = input[1];
241  x2 = input[2];
242  x3 = input[3];
243
244  if (!(x0 | x1 | x2 | x3)) {
245    output[0] = output[1] = output[2] = output[3] = 0;
246    return;
247  }
248
249  s0 = sinpi_1_9 * x0;
250  s1 = sinpi_4_9 * x0;
251  s2 = sinpi_2_9 * x1;
252  s3 = sinpi_1_9 * x1;
253  s4 = sinpi_3_9 * x2;
254  s5 = sinpi_4_9 * x3;
255  s6 = sinpi_2_9 * x3;
256  s7 = x0 + x1 - x3;
257
258  x0 = s0 + s2 + s5;
259  x1 = sinpi_3_9 * s7;
260  x2 = s1 - s3 + s6;
261  x3 = s4;
262
263  s0 = x0 + x3;
264  s1 = x1;
265  s2 = x2 - x3;
266  s3 = x2 - x0 + x3;
267
268  // 1-D transform scaling factor is sqrt(2).
269  output[0] = (tran_low_t)fdct_round_shift(s0);
270  output[1] = (tran_low_t)fdct_round_shift(s1);
271  output[2] = (tran_low_t)fdct_round_shift(s2);
272  output[3] = (tran_low_t)fdct_round_shift(s3);
273}
274
275static void fadst8(const tran_low_t *input, tran_low_t *output) {
276  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
277
278  tran_high_t x0 = input[7];
279  tran_high_t x1 = input[0];
280  tran_high_t x2 = input[5];
281  tran_high_t x3 = input[2];
282  tran_high_t x4 = input[3];
283  tran_high_t x5 = input[4];
284  tran_high_t x6 = input[1];
285  tran_high_t x7 = input[6];
286
287  // stage 1
288  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
289  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
290  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
291  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
292  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
293  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
294  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
295  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
296
297  x0 = fdct_round_shift(s0 + s4);
298  x1 = fdct_round_shift(s1 + s5);
299  x2 = fdct_round_shift(s2 + s6);
300  x3 = fdct_round_shift(s3 + s7);
301  x4 = fdct_round_shift(s0 - s4);
302  x5 = fdct_round_shift(s1 - s5);
303  x6 = fdct_round_shift(s2 - s6);
304  x7 = fdct_round_shift(s3 - s7);
305
306  // stage 2
307  s0 = x0;
308  s1 = x1;
309  s2 = x2;
310  s3 = x3;
311  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
312  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
313  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
314  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
315
316  x0 = s0 + s2;
317  x1 = s1 + s3;
318  x2 = s0 - s2;
319  x3 = s1 - s3;
320  x4 = fdct_round_shift(s4 + s6);
321  x5 = fdct_round_shift(s5 + s7);
322  x6 = fdct_round_shift(s4 - s6);
323  x7 = fdct_round_shift(s5 - s7);
324
325  // stage 3
326  s2 = cospi_16_64 * (x2 + x3);
327  s3 = cospi_16_64 * (x2 - x3);
328  s6 = cospi_16_64 * (x6 + x7);
329  s7 = cospi_16_64 * (x6 - x7);
330
331  x2 = fdct_round_shift(s2);
332  x3 = fdct_round_shift(s3);
333  x6 = fdct_round_shift(s6);
334  x7 = fdct_round_shift(s7);
335
336  output[0] = (tran_low_t)x0;
337  output[1] = (tran_low_t)-x4;
338  output[2] = (tran_low_t)x6;
339  output[3] = (tran_low_t)-x2;
340  output[4] = (tran_low_t)x3;
341  output[5] = (tran_low_t)-x7;
342  output[6] = (tran_low_t)x5;
343  output[7] = (tran_low_t)-x1;
344}
345
// 16-point forward ADST: reads input[0..15], writes output[0..15].
//
// Inputs are loaded in a permuted order into x0..x15, then pushed through
// four stages. Each stage forms products in s0..s15 and recombines them into
// x0..x15; fdct_round_shift() is applied wherever a value carries a cospi
// factor. The final stores permute the results and negate the values written
// to output[1], output[3], output[13] and output[15]. All inputs are read
// before the first store to `output`.
static void fadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Permuted loads: even x-indices take input[15], [13], ... and odd
  // x-indices take input[0], [2], ... for the first half; the second half
  // continues the same interleave from the centre outwards.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // stage 1
  // Rotate each adjacent pair (x0,x1), (x2,x3), ... (x14,x15).
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  // Butterflies between s[k] and s[k + 8], rounded back to scale.
  x0 = fdct_round_shift(s0 + s8);
  x1 = fdct_round_shift(s1 + s9);
  x2 = fdct_round_shift(s2 + s10);
  x3 = fdct_round_shift(s3 + s11);
  x4 = fdct_round_shift(s4 + s12);
  x5 = fdct_round_shift(s5 + s13);
  x6 = fdct_round_shift(s6 + s14);
  x7 = fdct_round_shift(s7 + s15);
  x8  = fdct_round_shift(s0 - s8);
  x9  = fdct_round_shift(s1 - s9);
  x10 = fdct_round_shift(s2 - s10);
  x11 = fdct_round_shift(s3 - s11);
  x12 = fdct_round_shift(s4 - s12);
  x13 = fdct_round_shift(s5 - s13);
  x14 = fdct_round_shift(s6 - s14);
  x15 = fdct_round_shift(s7 - s15);

  // stage 2
  // x0..x7 pass through unchanged; x8..x15 are rotated in pairs.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  // Butterflies between s[k] and s[k + 4] within each half. Only the rotated
  // upper half carries a cospi factor, so only it is rounded.
  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = fdct_round_shift(s8 + s12);
  x9 = fdct_round_shift(s9 + s13);
  x10 = fdct_round_shift(s10 + s14);
  x11 = fdct_round_shift(s11 + s15);
  x12 = fdct_round_shift(s8 - s12);
  x13 = fdct_round_shift(s9 - s13);
  x14 = fdct_round_shift(s10 - s14);
  x15 = fdct_round_shift(s11 - s15);

  // stage 3
  // In each group of eight, the first four pass through and the last four
  // are rotated by the cospi_8_64 / cospi_24_64 constants.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  // Butterflies between s[k] and s[k + 2] within each group of four; the
  // rotated groups are rounded, the pass-through groups are not.
  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = fdct_round_shift(s4 + s6);
  x5 = fdct_round_shift(s5 + s7);
  x6 = fdct_round_shift(s4 - s6);
  x7 = fdct_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = fdct_round_shift(s12 + s14);
  x13 = fdct_round_shift(s13 + s15);
  x14 = fdct_round_shift(s12 - s14);
  x15 = fdct_round_shift(s13 - s15);

  // stage 4
  // Scale the sum and difference of the pairs (2,3), (6,7), (10,11) and
  // (14,15) by cospi_16_64. The signs differ per pair, so the expressions
  // are not interchangeable.
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = fdct_round_shift(s2);
  x3 = fdct_round_shift(s3);
  x6 = fdct_round_shift(s6);
  x7 = fdct_round_shift(s7);
  x10 = fdct_round_shift(s10);
  x11 = fdct_round_shift(s11);
  x14 = fdct_round_shift(s14);
  x15 = fdct_round_shift(s15);

  // Output permutation; indices 1, 3, 13 and 15 receive negated values.
  output[0] = (tran_low_t)x0;
  output[1] = (tran_low_t)-x8;
  output[2] = (tran_low_t)x12;
  output[3] = (tran_low_t)-x4;
  output[4] = (tran_low_t)x6;
  output[5] = (tran_low_t)x14;
  output[6] = (tran_low_t)x10;
  output[7] = (tran_low_t)x2;
  output[8] = (tran_low_t)x3;
  output[9] = (tran_low_t)x11;
  output[10] = (tran_low_t)x15;
  output[11] = (tran_low_t)x7;
  output[12] = (tran_low_t)x5;
  output[13] = (tran_low_t)-x13;
  output[14] = (tran_low_t)x9;
  output[15] = (tran_low_t)-x1;
}
508
// 1-D transform pairs for the 4x4 hybrid transform, indexed by tx_type.
// vp9_fht4x4_c() calls the pair as .cols (first pass) and .rows (second
// pass); the initializers are presumably in { cols, rows } order -- confirm
// against the transform_2d declaration. The DCT_DCT entry is not reached
// from vp9_fht4x4_c(), which hands that case to vpx_fdct4x4_c() instead.
static const transform_2d FHT_4[] = {
  { fdct4,  fdct4  },  // DCT_DCT  = 0
  { fadst4, fdct4  },  // ADST_DCT = 1
  { fdct4,  fadst4 },  // DCT_ADST = 2
  { fadst4, fadst4 }   // ADST_ADST = 3
};
515
// 1-D transform pairs for the 8x8 hybrid transform, indexed by tx_type.
// vp9_fht8x8_c() calls the pair as .cols (first pass) and .rows (second
// pass); the initializers are presumably in { cols, rows } order -- confirm
// against the transform_2d declaration. The DCT_DCT entry is not reached
// from vp9_fht8x8_c(), which hands that case to vpx_fdct8x8_c() instead.
static const transform_2d FHT_8[] = {
  { fdct8,  fdct8  },  // DCT_DCT  = 0
  { fadst8, fdct8  },  // ADST_DCT = 1
  { fdct8,  fadst8 },  // DCT_ADST = 2
  { fadst8, fadst8 }   // ADST_ADST = 3
};
522
// 1-D transform pairs for the 16x16 hybrid transform, indexed by tx_type.
// vp9_fht16x16_c() calls the pair as .cols (first pass) and .rows (second
// pass); the initializers are presumably in { cols, rows } order -- confirm
// against the transform_2d declaration. The DCT_DCT entry is not reached
// from vp9_fht16x16_c(), which hands that case to vpx_fdct16x16_c() instead.
static const transform_2d FHT_16[] = {
  { fdct16,  fdct16  },  // DCT_DCT  = 0
  { fadst16, fdct16  },  // ADST_DCT = 1
  { fdct16,  fadst16 },  // DCT_ADST = 2
  { fadst16, fadst16 }   // ADST_ADST = 3
};
529
530void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
531                  int stride, int tx_type) {
532  if (tx_type == DCT_DCT) {
533    vpx_fdct4x4_c(input, output, stride);
534  } else {
535    tran_low_t out[4 * 4];
536    int i, j;
537    tran_low_t temp_in[4], temp_out[4];
538    const transform_2d ht = FHT_4[tx_type];
539
540    // Columns
541    for (i = 0; i < 4; ++i) {
542      for (j = 0; j < 4; ++j)
543        temp_in[j] = input[j * stride + i] * 16;
544      if (i == 0 && temp_in[0])
545        temp_in[0] += 1;
546      ht.cols(temp_in, temp_out);
547      for (j = 0; j < 4; ++j)
548        out[j * 4 + i] = temp_out[j];
549    }
550
551    // Rows
552    for (i = 0; i < 4; ++i) {
553      for (j = 0; j < 4; ++j)
554        temp_in[j] = out[j + i * 4];
555      ht.rows(temp_in, temp_out);
556      for (j = 0; j < 4; ++j)
557        output[j + i * 4] = (temp_out[j] + 1) >> 2;
558    }
559  }
560}
561
562void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
563                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
564                         int skip_block,
565                         const int16_t *zbin_ptr, const int16_t *round_ptr,
566                         const int16_t *quant_ptr,
567                         const int16_t *quant_shift_ptr,
568                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
569                         const int16_t *dequant_ptr,
570                         uint16_t *eob_ptr,
571                         const int16_t *scan, const int16_t *iscan) {
572  int eob = -1;
573
574  int i, j;
575  tran_low_t intermediate[64];
576
577  // Transform columns
578  {
579    tran_low_t *output = intermediate;
580    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
581    tran_high_t t0, t1, t2, t3;                  // needs32
582    tran_high_t x0, x1, x2, x3;                  // canbe16
583
584    int i;
585    for (i = 0; i < 8; i++) {
586      // stage 1
587      s0 = (input[0 * stride] + input[7 * stride]) * 4;
588      s1 = (input[1 * stride] + input[6 * stride]) * 4;
589      s2 = (input[2 * stride] + input[5 * stride]) * 4;
590      s3 = (input[3 * stride] + input[4 * stride]) * 4;
591      s4 = (input[3 * stride] - input[4 * stride]) * 4;
592      s5 = (input[2 * stride] - input[5 * stride]) * 4;
593      s6 = (input[1 * stride] - input[6 * stride]) * 4;
594      s7 = (input[0 * stride] - input[7 * stride]) * 4;
595
596      // fdct4(step, step);
597      x0 = s0 + s3;
598      x1 = s1 + s2;
599      x2 = s1 - s2;
600      x3 = s0 - s3;
601      t0 = (x0 + x1) * cospi_16_64;
602      t1 = (x0 - x1) * cospi_16_64;
603      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
604      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
605      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
606      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
607      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
608      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
609
610      // Stage 2
611      t0 = (s6 - s5) * cospi_16_64;
612      t1 = (s6 + s5) * cospi_16_64;
613      t2 = fdct_round_shift(t0);
614      t3 = fdct_round_shift(t1);
615
616      // Stage 3
617      x0 = s4 + t2;
618      x1 = s4 - t2;
619      x2 = s7 - t3;
620      x3 = s7 + t3;
621
622      // Stage 4
623      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
624      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
625      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
626      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
627      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
628      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
629      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
630      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
631      input++;
632      output++;
633    }
634  }
635
636  // Rows
637  for (i = 0; i < 8; ++i) {
638    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
639    for (j = 0; j < 8; ++j)
640      coeff_ptr[j + i * 8] /= 2;
641  }
642
643  // TODO(jingning) Decide the need of these arguments after the
644  // quantization process is completed.
645  (void)zbin_ptr;
646  (void)quant_shift_ptr;
647  (void)iscan;
648
649  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
650  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
651
652  if (!skip_block) {
653    // Quantization pass: All coefficients with index >= zero_flag are
654    // skippable. Note: zero_flag can be zero.
655    for (i = 0; i < n_coeffs; i++) {
656      const int rc = scan[i];
657      const int coeff = coeff_ptr[rc];
658      const int coeff_sign = (coeff >> 31);
659      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
660
661      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
662      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
663
664      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
665      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
666
667      if (tmp)
668        eob = i;
669    }
670  }
671  *eob_ptr = eob + 1;
672}
673
674void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
675                  int stride, int tx_type) {
676  if (tx_type == DCT_DCT) {
677    vpx_fdct8x8_c(input, output, stride);
678  } else {
679    tran_low_t out[64];
680    int i, j;
681    tran_low_t temp_in[8], temp_out[8];
682    const transform_2d ht = FHT_8[tx_type];
683
684    // Columns
685    for (i = 0; i < 8; ++i) {
686      for (j = 0; j < 8; ++j)
687        temp_in[j] = input[j * stride + i] * 4;
688      ht.cols(temp_in, temp_out);
689      for (j = 0; j < 8; ++j)
690        out[j * 8 + i] = temp_out[j];
691    }
692
693    // Rows
694    for (i = 0; i < 8; ++i) {
695      for (j = 0; j < 8; ++j)
696        temp_in[j] = out[j + i * 8];
697      ht.rows(temp_in, temp_out);
698      for (j = 0; j < 8; ++j)
699        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
700    }
701  }
702}
703
704/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
705   pixel. */
706void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
707  int i;
708  tran_high_t a1, b1, c1, d1, e1;
709  const int16_t *ip_pass0 = input;
710  const tran_low_t *ip = NULL;
711  tran_low_t *op = output;
712
713  for (i = 0; i < 4; i++) {
714    a1 = ip_pass0[0 * stride];
715    b1 = ip_pass0[1 * stride];
716    c1 = ip_pass0[2 * stride];
717    d1 = ip_pass0[3 * stride];
718
719    a1 += b1;
720    d1 = d1 - c1;
721    e1 = (a1 - d1) >> 1;
722    b1 = e1 - b1;
723    c1 = e1 - c1;
724    a1 -= c1;
725    d1 += b1;
726    op[0] = (tran_low_t)a1;
727    op[4] = (tran_low_t)c1;
728    op[8] = (tran_low_t)d1;
729    op[12] = (tran_low_t)b1;
730
731    ip_pass0++;
732    op++;
733  }
734  ip = output;
735  op = output;
736
737  for (i = 0; i < 4; i++) {
738    a1 = ip[0];
739    b1 = ip[1];
740    c1 = ip[2];
741    d1 = ip[3];
742
743    a1 += b1;
744    d1 -= c1;
745    e1 = (a1 - d1) >> 1;
746    b1 = e1 - b1;
747    c1 = e1 - c1;
748    a1 -= c1;
749    d1 += b1;
750    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
751    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
752    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
753    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
754
755    ip += 4;
756    op += 4;
757  }
758}
759
760void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
761                    int stride, int tx_type) {
762  if (tx_type == DCT_DCT) {
763    vpx_fdct16x16_c(input, output, stride);
764  } else {
765    tran_low_t out[256];
766    int i, j;
767    tran_low_t temp_in[16], temp_out[16];
768    const transform_2d ht = FHT_16[tx_type];
769
770    // Columns
771    for (i = 0; i < 16; ++i) {
772      for (j = 0; j < 16; ++j)
773        temp_in[j] = input[j * stride + i] * 4;
774      ht.cols(temp_in, temp_out);
775      for (j = 0; j < 16; ++j)
776        out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
777    }
778
779    // Rows
780    for (i = 0; i < 16; ++i) {
781      for (j = 0; j < 16; ++j)
782        temp_in[j] = out[j + i * 16];
783      ht.rows(temp_in, temp_out);
784      for (j = 0; j < 16; ++j)
785        output[j + i * 16] = temp_out[j];
786    }
787  }
788}
789
// High-bitdepth entry points, compiled only when CONFIG_VP9_HIGHBITDEPTH is
// set. Each one forwards its arguments unchanged to the corresponding
// non-highbd C implementation above.
#if CONFIG_VP9_HIGHBITDEPTH
// Forwards to vp9_fht4x4_c().
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
                         int stride, int tx_type) {
  vp9_fht4x4_c(input, output, stride, tx_type);
}

// Forwards to vp9_fht8x8_c().
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
                         int stride, int tx_type) {
  vp9_fht8x8_c(input, output, stride, tx_type);
}

// Forwards to vp9_fwht4x4_c().
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vp9_fwht4x4_c(input, output, stride);
}

// Forwards to vp9_fht16x16_c().
void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
                           int stride, int tx_type) {
  vp9_fht16x16_c(input, output, stride, tx_type);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
811