/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include <arm_neon.h>
12
13#include "./vpx_config.h"
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/arm/idct_neon.h"
16#include "vpx_dsp/arm/transpose_neon.h"
17#include "vpx_dsp/txfm_common.h"
18
// Only for the first pass of the _34_ variant. Since it only uses values from
// the top left 8x8 it can safely assume all the remaining values are 0 and
// skip an awful lot of calculations. In fact, only the first 6 columns make
// the cut. None of the elements in the 7th or 8th column are used, so it
// skips any calls to input[6] or input[7] too.
// In C this does a single row of 32 for each call. Here it transposes the top
// left 8x8 to allow using SIMD.

// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
// coefficients as follows:
//    0  1  2  3  4  5  6  7
// 0  0  2  5 10 17 25
// 1  1  4  8 15 22 30
// 2  3  7 12 18 28
// 3  6 11 16 23 31
// 4  9 14 19 29
// 5 13 20 26
// 6 21 27 33
// 7 24 32
// First pass of the 34-coefficient 32x32 inverse DCT (high bitdepth).
// Reads the top-left 8x8 of |input| (source rows are 32 coefficients apart)
// and writes 8 fully transformed rows of 32 int32 values to |output|,
// contiguously (8 * 32 values). Only columns 0-5 of the 8x8 contribute;
// see the scan-order comment above this function.
static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) {
  // Each int32x4x2_t holds one row of 8 coefficients. s1/s2/s3 are the
  // butterfly stage buffers; slots are reused between stages, so statement
  // order below is significant.
  int32x4x2_t in[8], s1[32], s2[32], s3[32];

  // Load the top-left 8x8 block of coefficients (row stride is 32).
  in[0].val[0] = vld1q_s32(input);
  in[0].val[1] = vld1q_s32(input + 4);
  input += 32;
  in[1].val[0] = vld1q_s32(input);
  in[1].val[1] = vld1q_s32(input + 4);
  input += 32;
  in[2].val[0] = vld1q_s32(input);
  in[2].val[1] = vld1q_s32(input + 4);
  input += 32;
  in[3].val[0] = vld1q_s32(input);
  in[3].val[1] = vld1q_s32(input + 4);
  input += 32;
  in[4].val[0] = vld1q_s32(input);
  in[4].val[1] = vld1q_s32(input + 4);
  input += 32;
  in[5].val[0] = vld1q_s32(input);
  in[5].val[1] = vld1q_s32(input + 4);
  input += 32;
  in[6].val[0] = vld1q_s32(input);
  in[6].val[1] = vld1q_s32(input + 4);
  input += 32;
  in[7].val[0] = vld1q_s32(input);
  in[7].val[1] = vld1q_s32(input + 4);
  // Transpose so that each in[i] holds a column of the original block,
  // allowing 8 rows of the row transform to run in parallel.
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1
  // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
  // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);

  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);

  // Negated coefficient: this slot pairs with input[29] (== 0) in the full
  // transform, where the sign is folded into the multiply.
  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);

  // stage 3
  s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
  s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);

  s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
                                                         s1[31], cospi_28_64);
  s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
                                                         s1[31], cospi_4_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
                                                         s1[27], cospi_12_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
                                                         s1[27], cospi_20_64);

  s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
                                                         s1[24], -cospi_20_64);
  s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
                                                         s1[24], cospi_12_64);

  // stage 4
  s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);

  s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
                                                        s2[15], cospi_24_64);
  s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
                                                         s2[15], cospi_8_64);

  s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
  s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
  s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
  s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
  s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
  s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
  s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
  s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);

  // stage 5
  s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
  s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);

  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64,
                                                         s1[30], cospi_24_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64,
                                                         s1[30], cospi_8_64);

  s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64,
                                                         s1[31], cospi_24_64);
  s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64,
                                                         s1[31], cospi_8_64);

  // NOTE: s1[20]/s1[27] and s1[21]/s1[26] are overwritten here; their
  // stage-3 values were already consumed into s2[20..27] above.
  s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
                                                         s2[27], -cospi_8_64);
  s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
                                                         s2[27], cospi_24_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
                                                         s2[26], -cospi_8_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
                                                         s2[26], cospi_24_64);

  // stage 6
  s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
  s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
  s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
  s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
  s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
  s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
  s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
  s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);

  s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64);
  s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64);

  s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64);
  s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64);

  // s2[16..23] must be written in this order: s2[22]/s2[23] are read before
  // being overwritten.
  s2[16] = highbd_idct_add_dual(s1[16], s2[23]);
  s2[17] = highbd_idct_add_dual(s1[17], s2[22]);
  s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
  s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
  s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
  s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
  s2[22] = highbd_idct_sub_dual(s1[17], s2[22]);
  s2[23] = highbd_idct_sub_dual(s1[16], s2[23]);

  s3[24] = highbd_idct_sub_dual(s1[31], s2[24]);
  s3[25] = highbd_idct_sub_dual(s1[30], s2[25]);
  s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
  s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
  s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
  s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
  s2[30] = highbd_idct_add_dual(s2[25], s1[30]);
  s2[31] = highbd_idct_add_dual(s2[24], s1[31]);

  // stage 7
  s1[0] = highbd_idct_add_dual(s2[0], s2[15]);
  s1[1] = highbd_idct_add_dual(s2[1], s2[14]);
  s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
  s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
  s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
  s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
  s1[6] = highbd_idct_add_dual(s2[6], s2[9]);
  s1[7] = highbd_idct_add_dual(s2[7], s2[8]);
  s1[8] = highbd_idct_sub_dual(s2[7], s2[8]);
  s1[9] = highbd_idct_sub_dual(s2[6], s2[9]);
  s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
  s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
  s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
  s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
  s1[14] = highbd_idct_sub_dual(s2[1], s2[14]);
  s1[15] = highbd_idct_sub_dual(s2[0], s2[15]);

  s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
  s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);

  s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
  s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);

  s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64);
  s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64);

  s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64);
  s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64);

  // final stage
  s3[0] = highbd_idct_add_dual(s1[0], s2[31]);
  s3[1] = highbd_idct_add_dual(s1[1], s2[30]);
  s3[2] = highbd_idct_add_dual(s1[2], s2[29]);
  s3[3] = highbd_idct_add_dual(s1[3], s2[28]);
  s3[4] = highbd_idct_add_dual(s1[4], s1[27]);
  s3[5] = highbd_idct_add_dual(s1[5], s1[26]);
  s3[6] = highbd_idct_add_dual(s1[6], s1[25]);
  s3[7] = highbd_idct_add_dual(s1[7], s1[24]);
  s3[8] = highbd_idct_add_dual(s1[8], s1[23]);
  s3[9] = highbd_idct_add_dual(s1[9], s1[22]);
  s3[10] = highbd_idct_add_dual(s1[10], s1[21]);
  s3[11] = highbd_idct_add_dual(s1[11], s1[20]);
  s3[12] = highbd_idct_add_dual(s1[12], s2[19]);
  s3[13] = highbd_idct_add_dual(s1[13], s2[18]);
  s3[14] = highbd_idct_add_dual(s1[14], s2[17]);
  s3[15] = highbd_idct_add_dual(s1[15], s2[16]);
  s3[16] = highbd_idct_sub_dual(s1[15], s2[16]);
  s3[17] = highbd_idct_sub_dual(s1[14], s2[17]);
  s3[18] = highbd_idct_sub_dual(s1[13], s2[18]);
  s3[19] = highbd_idct_sub_dual(s1[12], s2[19]);
  s3[20] = highbd_idct_sub_dual(s1[11], s1[20]);
  s3[21] = highbd_idct_sub_dual(s1[10], s1[21]);
  s3[22] = highbd_idct_sub_dual(s1[9], s1[22]);
  s3[23] = highbd_idct_sub_dual(s1[8], s1[23]);
  s3[24] = highbd_idct_sub_dual(s1[7], s1[24]);
  s3[25] = highbd_idct_sub_dual(s1[6], s1[25]);
  s3[26] = highbd_idct_sub_dual(s1[5], s1[26]);
  s3[27] = highbd_idct_sub_dual(s1[4], s1[27]);
  s3[28] = highbd_idct_sub_dual(s1[3], s2[28]);
  s3[29] = highbd_idct_sub_dual(s1[2], s2[29]);
  s3[30] = highbd_idct_sub_dual(s1[1], s2[30]);
  s3[31] = highbd_idct_sub_dual(s1[0], s2[31]);

  // Store all 32 results contiguously: 8 rows of 32 values each (the data is
  // still in the transposed orientation; the second pass reads it back with
  // load_and_transpose_s32_8x8).
  vst1q_s32(output, s3[0].val[0]);
  output += 4;
  vst1q_s32(output, s3[0].val[1]);
  output += 4;
  vst1q_s32(output, s3[1].val[0]);
  output += 4;
  vst1q_s32(output, s3[1].val[1]);
  output += 4;
  vst1q_s32(output, s3[2].val[0]);
  output += 4;
  vst1q_s32(output, s3[2].val[1]);
  output += 4;
  vst1q_s32(output, s3[3].val[0]);
  output += 4;
  vst1q_s32(output, s3[3].val[1]);
  output += 4;
  vst1q_s32(output, s3[4].val[0]);
  output += 4;
  vst1q_s32(output, s3[4].val[1]);
  output += 4;
  vst1q_s32(output, s3[5].val[0]);
  output += 4;
  vst1q_s32(output, s3[5].val[1]);
  output += 4;
  vst1q_s32(output, s3[6].val[0]);
  output += 4;
  vst1q_s32(output, s3[6].val[1]);
  output += 4;
  vst1q_s32(output, s3[7].val[0]);
  output += 4;
  vst1q_s32(output, s3[7].val[1]);
  output += 4;

  vst1q_s32(output, s3[8].val[0]);
  output += 4;
  vst1q_s32(output, s3[8].val[1]);
  output += 4;
  vst1q_s32(output, s3[9].val[0]);
  output += 4;
  vst1q_s32(output, s3[9].val[1]);
  output += 4;
  vst1q_s32(output, s3[10].val[0]);
  output += 4;
  vst1q_s32(output, s3[10].val[1]);
  output += 4;
  vst1q_s32(output, s3[11].val[0]);
  output += 4;
  vst1q_s32(output, s3[11].val[1]);
  output += 4;
  vst1q_s32(output, s3[12].val[0]);
  output += 4;
  vst1q_s32(output, s3[12].val[1]);
  output += 4;
  vst1q_s32(output, s3[13].val[0]);
  output += 4;
  vst1q_s32(output, s3[13].val[1]);
  output += 4;
  vst1q_s32(output, s3[14].val[0]);
  output += 4;
  vst1q_s32(output, s3[14].val[1]);
  output += 4;
  vst1q_s32(output, s3[15].val[0]);
  output += 4;
  vst1q_s32(output, s3[15].val[1]);
  output += 4;

  vst1q_s32(output, s3[16].val[0]);
  output += 4;
  vst1q_s32(output, s3[16].val[1]);
  output += 4;
  vst1q_s32(output, s3[17].val[0]);
  output += 4;
  vst1q_s32(output, s3[17].val[1]);
  output += 4;
  vst1q_s32(output, s3[18].val[0]);
  output += 4;
  vst1q_s32(output, s3[18].val[1]);
  output += 4;
  vst1q_s32(output, s3[19].val[0]);
  output += 4;
  vst1q_s32(output, s3[19].val[1]);
  output += 4;
  vst1q_s32(output, s3[20].val[0]);
  output += 4;
  vst1q_s32(output, s3[20].val[1]);
  output += 4;
  vst1q_s32(output, s3[21].val[0]);
  output += 4;
  vst1q_s32(output, s3[21].val[1]);
  output += 4;
  vst1q_s32(output, s3[22].val[0]);
  output += 4;
  vst1q_s32(output, s3[22].val[1]);
  output += 4;
  vst1q_s32(output, s3[23].val[0]);
  output += 4;
  vst1q_s32(output, s3[23].val[1]);
  output += 4;

  vst1q_s32(output, s3[24].val[0]);
  output += 4;
  vst1q_s32(output, s3[24].val[1]);
  output += 4;
  vst1q_s32(output, s3[25].val[0]);
  output += 4;
  vst1q_s32(output, s3[25].val[1]);
  output += 4;
  vst1q_s32(output, s3[26].val[0]);
  output += 4;
  vst1q_s32(output, s3[26].val[1]);
  output += 4;
  vst1q_s32(output, s3[27].val[0]);
  output += 4;
  vst1q_s32(output, s3[27].val[1]);
  output += 4;
  vst1q_s32(output, s3[28].val[0]);
  output += 4;
  vst1q_s32(output, s3[28].val[1]);
  output += 4;
  vst1q_s32(output, s3[29].val[0]);
  output += 4;
  vst1q_s32(output, s3[29].val[1]);
  output += 4;
  vst1q_s32(output, s3[30].val[0]);
  output += 4;
  vst1q_s32(output, s3[30].val[1]);
  output += 4;
  vst1q_s32(output, s3[31].val[0]);
  output += 4;
  vst1q_s32(output, s3[31].val[1]);
}
372
// Second pass of the 34-coefficient 32x32 inverse DCT (high bitdepth).
// Consumes one 8x8 slice of the first-pass intermediate (|input|, row
// stride 8) and produces one 8-column-wide, 32-row-tall slice of the
// reconstruction, added into |output| (pixel row stride |stride|) with
// clamping appropriate for bit depth |bd|.
// Unlike the _6_ first pass, this one also uses columns 6 and 7 (the
// "Different for _8_" sections below).
static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
                                     int stride, const int bd) {
  // Butterfly stage buffers; slots are aggressively reused between stages,
  // so the statement order below is significant.
  int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32];

  load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
                             &in[5], &in[6], &in[7]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);

  // Different for _8_
  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);

  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);

  // Column 6 contributes here (also absent from the _6_ variant).
  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);

  // stage 3
  s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
  s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);

  s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
                                                         s1[31], cospi_28_64);
  s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
                                                         s1[31], cospi_4_64);

  // Different for _8_
  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64,
                                                         s1[28], -cospi_4_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64,
                                                         s1[28], cospi_28_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
                                                         s1[27], cospi_12_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
                                                         s1[27], cospi_20_64);

  s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
                                                         s1[24], -cospi_20_64);
  s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
                                                         s1[24], cospi_12_64);

  // stage 4
  s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);

  s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
                                                        s2[15], cospi_24_64);
  s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
                                                         s2[15], cospi_8_64);

  s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64,
                                                         s2[12], -cospi_8_64);
  s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64,
                                                         s2[12], cospi_24_64);

  s2[16] = highbd_idct_add_dual(s1[16], s1[19]);

  s2[17] = highbd_idct_add_dual(s1[17], s1[18]);
  s2[18] = highbd_idct_sub_dual(s1[17], s1[18]);

  s2[19] = highbd_idct_sub_dual(s1[16], s1[19]);

  s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
  s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);

  s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
  s2[23] = highbd_idct_add_dual(s1[20], s1[23]);

  s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
  s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
  s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
  s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);

  s2[28] = highbd_idct_sub_dual(s1[31], s1[28]);
  s2[29] = highbd_idct_sub_dual(s1[30], s1[29]);
  s2[30] = highbd_idct_add_dual(s1[29], s1[30]);
  s2[31] = highbd_idct_add_dual(s1[28], s1[31]);

  // stage 5
  s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
  s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);

  s1[8] = highbd_idct_add_dual(s2[8], s2[11]);
  s1[9] = highbd_idct_add_dual(s2[9], s2[10]);
  s1[10] = highbd_idct_sub_dual(s2[9], s2[10]);
  s1[11] = highbd_idct_sub_dual(s2[8], s2[11]);
  s1[12] = highbd_idct_sub_dual(s2[15], s2[12]);
  s1[13] = highbd_idct_sub_dual(s2[14], s2[13]);
  s1[14] = highbd_idct_add_dual(s2[13], s2[14]);
  s1[15] = highbd_idct_add_dual(s2[12], s2[15]);

  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64,
                                                         s2[29], cospi_24_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64,
                                                         s2[29], cospi_8_64);

  s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64,
                                                         s2[28], cospi_24_64);
  s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64,
                                                         s2[28], cospi_8_64);

  s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
                                                         s2[27], -cospi_8_64);
  s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
                                                         s2[27], cospi_24_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
                                                         s2[26], -cospi_8_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
                                                         s2[26], cospi_24_64);

  // stage 6
  s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
  s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
  s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
  s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
  s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
  s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
  s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
  s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);

  s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64);
  s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64);

  s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64);
  s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64);

  // Results land in a mix of s1 and s2 slots here so that the still-live
  // stage-4/5 values are not clobbered before they are read below.
  s1[16] = highbd_idct_add_dual(s2[16], s2[23]);
  s1[17] = highbd_idct_add_dual(s2[17], s2[22]);
  s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
  s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
  s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
  s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
  s1[22] = highbd_idct_sub_dual(s2[17], s2[22]);
  s1[23] = highbd_idct_sub_dual(s2[16], s2[23]);

  s3[24] = highbd_idct_sub_dual(s2[31], s2[24]);
  s3[25] = highbd_idct_sub_dual(s2[30], s2[25]);
  s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
  s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
  s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
  s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
  s2[30] = highbd_idct_add_dual(s2[25], s2[30]);
  s2[31] = highbd_idct_add_dual(s2[24], s2[31]);

  // stage 7
  s1[0] = highbd_idct_add_dual(s2[0], s1[15]);
  s1[1] = highbd_idct_add_dual(s2[1], s1[14]);
  s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
  s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
  s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
  s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
  s1[6] = highbd_idct_add_dual(s2[6], s1[9]);
  s1[7] = highbd_idct_add_dual(s2[7], s1[8]);
  s1[8] = highbd_idct_sub_dual(s2[7], s1[8]);
  s1[9] = highbd_idct_sub_dual(s2[6], s1[9]);
  s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
  s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
  s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
  s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
  s1[14] = highbd_idct_sub_dual(s2[1], s1[14]);
  s1[15] = highbd_idct_sub_dual(s2[0], s1[15]);

  s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
  s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);

  s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
  s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);

  s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64);
  s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64);

  s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64);
  s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64);

  // final stage
  out[0] = highbd_idct_add_dual(s1[0], s2[31]);
  out[1] = highbd_idct_add_dual(s1[1], s2[30]);
  out[2] = highbd_idct_add_dual(s1[2], s2[29]);
  out[3] = highbd_idct_add_dual(s1[3], s2[28]);
  out[4] = highbd_idct_add_dual(s1[4], s1[27]);
  out[5] = highbd_idct_add_dual(s1[5], s1[26]);
  out[6] = highbd_idct_add_dual(s1[6], s1[25]);
  out[7] = highbd_idct_add_dual(s1[7], s1[24]);
  out[8] = highbd_idct_add_dual(s1[8], s2[23]);
  out[9] = highbd_idct_add_dual(s1[9], s2[22]);
  out[10] = highbd_idct_add_dual(s1[10], s1[21]);
  out[11] = highbd_idct_add_dual(s1[11], s1[20]);
  out[12] = highbd_idct_add_dual(s1[12], s2[19]);
  out[13] = highbd_idct_add_dual(s1[13], s2[18]);
  out[14] = highbd_idct_add_dual(s1[14], s1[17]);
  out[15] = highbd_idct_add_dual(s1[15], s1[16]);
  out[16] = highbd_idct_sub_dual(s1[15], s1[16]);
  out[17] = highbd_idct_sub_dual(s1[14], s1[17]);
  out[18] = highbd_idct_sub_dual(s1[13], s2[18]);
  out[19] = highbd_idct_sub_dual(s1[12], s2[19]);
  out[20] = highbd_idct_sub_dual(s1[11], s1[20]);
  out[21] = highbd_idct_sub_dual(s1[10], s1[21]);
  out[22] = highbd_idct_sub_dual(s1[9], s2[22]);
  out[23] = highbd_idct_sub_dual(s1[8], s2[23]);
  out[24] = highbd_idct_sub_dual(s1[7], s1[24]);
  out[25] = highbd_idct_sub_dual(s1[6], s1[25]);
  out[26] = highbd_idct_sub_dual(s1[5], s1[26]);
  out[27] = highbd_idct_sub_dual(s1[4], s1[27]);
  out[28] = highbd_idct_sub_dual(s1[3], s2[28]);
  out[29] = highbd_idct_sub_dual(s1[2], s2[29]);
  out[30] = highbd_idct_sub_dual(s1[1], s2[30]);
  out[31] = highbd_idct_sub_dual(s1[0], s2[31]);

  // Add the 32 result rows into the destination in two 16-row halves,
  // clamping to the valid pixel range for |bd|.
  highbd_idct16x16_add_store(out, output, stride, bd);
  highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
596
597void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
598                                      int stride, int bd) {
599  int i;
600
601  if (bd == 8) {
602    int16_t temp[32 * 8];
603    int16_t *t = temp;
604
605    vpx_idct32_6_neon(input, t);
606
607    for (i = 0; i < 32; i += 8) {
608      vpx_idct32_8_neon(t, dest, stride, 1);
609      t += (8 * 8);
610      dest += 8;
611    }
612  } else {
613    int32_t temp[32 * 8];
614    int32_t *t = temp;
615
616    vpx_highbd_idct32_6_neon(input, t);
617
618    for (i = 0; i < 32; i += 8) {
619      vpx_highbd_idct32_8_neon(t, dest, stride, bd);
620      t += (8 * 8);
621      dest += 8;
622    }
623  }
624}
625