/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"

// Most gcc 4.9 distributions outside of Android do not generate correct code
// for these functions.
#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
    __GNUC__ == 4 && __GNUC_MINOR__ <= 9

void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  vpx_fdct32x32_c(input, output, stride);
}

void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  vpx_fdct32x32_rd_c(input, output, stride);
}

#else

#define LOAD_INCREMENT(src, stride, dest, index) \
  do {                                           \
    dest[index] = vld1q_s16(src);                \
    src += stride;                               \
  } while (0)

#define ADD_S16(src, index0, index1, dest, index3)      \
  do {                                                  \
    dest[index3] = vaddq_s16(src[index0], src[index1]); \
  } while (0)

#define ADD_SHIFT_S16(src, index0, index1)                             \
  do {                                                                 \
    src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
  } while (0)

// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
// middle 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
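// On return, b[i] holds 4 * (row[i] + row[31 - i]) for i < 16 and
// 4 * (row[31 - i] - row[i]) for i >= 16, i.e. the stage 1 butterfly of the
// column transform with the *4 input scaling already folded in.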
static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
  const int16_t *a_end = a + 24 * stride;
  int16x8_t c[8];

  LOAD_INCREMENT(a, stride, b, 0);
  LOAD_INCREMENT(a, stride, b, 1);
  LOAD_INCREMENT(a, stride, b, 2);
  LOAD_INCREMENT(a, stride, b, 3);
  LOAD_INCREMENT(a, stride, b, 4);
  LOAD_INCREMENT(a, stride, b, 5);
  LOAD_INCREMENT(a, stride, b, 6);
  LOAD_INCREMENT(a, stride, b, 7);

  LOAD_INCREMENT(a_end, stride, b, 24);
  LOAD_INCREMENT(a_end, stride, b, 25);
  LOAD_INCREMENT(a_end, stride, b, 26);
  LOAD_INCREMENT(a_end, stride, b, 27);
  LOAD_INCREMENT(a_end, stride, b, 28);
  LOAD_INCREMENT(a_end, stride, b, 29);
  LOAD_INCREMENT(a_end, stride, b, 30);
  LOAD_INCREMENT(a_end, stride, b, 31);

  ADD_S16(b, 0, 31, c, 0);
  ADD_S16(b, 1, 30, c, 1);
  ADD_S16(b, 2, 29, c, 2);
  ADD_S16(b, 3, 28, c, 3);
  ADD_S16(b, 4, 27, c, 4);
  ADD_S16(b, 5, 26, c, 5);
  ADD_S16(b, 6, 25, c, 6);
  ADD_S16(b, 7, 24, c, 7);

  ADD_SHIFT_S16(b, 7, 24);
  ADD_SHIFT_S16(b, 6, 25);
  ADD_SHIFT_S16(b, 5, 26);
  ADD_SHIFT_S16(b, 4, 27);
  ADD_SHIFT_S16(b, 3, 28);
  ADD_SHIFT_S16(b, 2, 29);
  ADD_SHIFT_S16(b, 1, 30);
  ADD_SHIFT_S16(b, 0, 31);

  b[0] = vshlq_n_s16(c[0], 2);
  b[1] = vshlq_n_s16(c[1], 2);
  b[2] = vshlq_n_s16(c[2], 2);
  b[3] = vshlq_n_s16(c[3], 2);
  b[4] = vshlq_n_s16(c[4], 2);
  b[5] = vshlq_n_s16(c[5], 2);
  b[6] = vshlq_n_s16(c[6], 2);
  b[7] = vshlq_n_s16(c[7], 2);

  LOAD_INCREMENT(a, stride, b, 8);
  LOAD_INCREMENT(a, stride, b, 9);
  LOAD_INCREMENT(a, stride, b, 10);
  LOAD_INCREMENT(a, stride, b, 11);
  LOAD_INCREMENT(a, stride, b, 12);
  LOAD_INCREMENT(a, stride, b, 13);
  LOAD_INCREMENT(a, stride, b, 14);
  LOAD_INCREMENT(a, stride, b, 15);
  LOAD_INCREMENT(a, stride, b, 16);
  LOAD_INCREMENT(a, stride, b, 17);
  LOAD_INCREMENT(a, stride, b, 18);
  LOAD_INCREMENT(a, stride, b, 19);
  LOAD_INCREMENT(a, stride, b, 20);
  LOAD_INCREMENT(a, stride, b, 21);
  LOAD_INCREMENT(a, stride, b, 22);
  LOAD_INCREMENT(a, stride, b, 23);

  ADD_S16(b, 8, 23, c, 0);
  ADD_S16(b, 9, 22, c, 1);
  ADD_S16(b, 10, 21, c, 2);
  ADD_S16(b, 11, 20, c, 3);
  ADD_S16(b, 12, 19, c, 4);
  ADD_S16(b, 13, 18, c, 5);
  ADD_S16(b, 14, 17, c, 6);
  ADD_S16(b, 15, 16, c, 7);

  ADD_SHIFT_S16(b, 15, 16);
  ADD_SHIFT_S16(b, 14, 17);
  ADD_SHIFT_S16(b, 13, 18);
  ADD_SHIFT_S16(b, 12, 19);
  ADD_SHIFT_S16(b, 11, 20);
  ADD_SHIFT_S16(b, 10, 21);
  ADD_SHIFT_S16(b, 9, 22);
  ADD_SHIFT_S16(b, 8, 23);

  b[8] = vshlq_n_s16(c[0], 2);
  b[9] = vshlq_n_s16(c[1], 2);
  b[10] = vshlq_n_s16(c[2], 2);
  b[11] = vshlq_n_s16(c[3], 2);
  b[12] = vshlq_n_s16(c[4], 2);
  b[13] = vshlq_n_s16(c[5], 2);
  b[14] = vshlq_n_s16(c[6], 2);
  b[15] = vshlq_n_s16(c[7], 2);
}

#undef LOAD_INCREMENT
#undef ADD_S16
#undef ADD_SHIFT_S16

#define STORE_S16(src, index, dest)           \
  do {                                        \
    store_s16q_to_tran_low(dest, src[index]); \
    dest += 8;                                \
  } while (0)

// Store 32 16x8 values, assuming stride == 32.
// Slight twist: store horizontally in blocks of 8.
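// The interleaved index order (0, 8, 16, 24, 1, 9, ...) writes one full
// 32-wide output row at a time: row i of the result comes from b[i],
// b[i + 8], b[i + 16] and b[i + 24], one 8-wide block per quarter.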
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
  STORE_S16(b, 0, a);
  STORE_S16(b, 8, a);
  STORE_S16(b, 16, a);
  STORE_S16(b, 24, a);
  STORE_S16(b, 1, a);
  STORE_S16(b, 9, a);
  STORE_S16(b, 17, a);
  STORE_S16(b, 25, a);
  STORE_S16(b, 2, a);
  STORE_S16(b, 10, a);
  STORE_S16(b, 18, a);
  STORE_S16(b, 26, a);
  STORE_S16(b, 3, a);
  STORE_S16(b, 11, a);
  STORE_S16(b, 19, a);
  STORE_S16(b, 27, a);
  STORE_S16(b, 4, a);
  STORE_S16(b, 12, a);
  STORE_S16(b, 20, a);
  STORE_S16(b, 28, a);
  STORE_S16(b, 5, a);
  STORE_S16(b, 13, a);
  STORE_S16(b, 21, a);
  STORE_S16(b, 29, a);
  STORE_S16(b, 6, a);
  STORE_S16(b, 14, a);
  STORE_S16(b, 22, a);
  STORE_S16(b, 30, a);
  STORE_S16(b, 7, a);
  STORE_S16(b, 15, a);
  STORE_S16(b, 23, a);
  STORE_S16(b, 31, a);
}

#undef STORE_S16

// fdct_round_shift((a +/- b) * c)
static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
                                       const tran_high_t constant,
                                       int16x8_t *add, int16x8_t *sub) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
  *add = vcombine_s16(rounded0, rounded1);
  *sub = vcombine_s16(rounded2, rounded3);
}

// fdct_round_shift(a * c0 +/- b * c1)
static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
                                       const tran_coef_t constant0,
                                       const tran_coef_t constant1,
                                       int16x8_t *add, int16x8_t *sub) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
  *add = vcombine_s16(rounded0, rounded1);
  *sub = vcombine_s16(rounded2, rounded3);
}

// Add 2 if positive, 1 if negative, and shift by 2.
// In practice, subtract the sign bit, then shift with rounding.
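// Both cases reduce to ((a - (a < 0)) + 2) >> 2, which is the sign-bit
// subtraction followed by the rounding shift vrshrq_n_s16(x, 2) below.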
static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
}

static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. X the first 16 values and the middle 8 of the second half.
  a[0] = vaddq_s16(in[0], in[15]);
  a[1] = vaddq_s16(in[1], in[14]);
  a[2] = vaddq_s16(in[2], in[13]);
  a[3] = vaddq_s16(in[3], in[12]);
  a[4] = vaddq_s16(in[4], in[11]);
  a[5] = vaddq_s16(in[5], in[10]);
  a[6] = vaddq_s16(in[6], in[9]);
  a[7] = vaddq_s16(in[7], in[8]);

  a[8] = vsubq_s16(in[7], in[8]);
  a[9] = vsubq_s16(in[6], in[9]);
  a[10] = vsubq_s16(in[5], in[10]);
  a[11] = vsubq_s16(in[4], in[11]);
  a[12] = vsubq_s16(in[3], in[12]);
  a[13] = vsubq_s16(in[2], in[13]);
  a[14] = vsubq_s16(in[1], in[14]);
  a[15] = vsubq_s16(in[0], in[15]);

  a[16] = in[16];
  a[17] = in[17];
  a[18] = in[18];
  a[19] = in[19];

  butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
  butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
  butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
  butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);

  a[28] = in[28];
  a[29] = in[29];
  a[30] = in[30];
  a[31] = in[31];

  // Stage 3.
  b[0] = vaddq_s16(a[0], a[7]);
  b[1] = vaddq_s16(a[1], a[6]);
  b[2] = vaddq_s16(a[2], a[5]);
  b[3] = vaddq_s16(a[3], a[4]);

  b[4] = vsubq_s16(a[3], a[4]);
  b[5] = vsubq_s16(a[2], a[5]);
  b[6] = vsubq_s16(a[1], a[6]);
  b[7] = vsubq_s16(a[0], a[7]);

  b[8] = a[8];
  b[9] = a[9];

  butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
  butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);

  b[14] = a[14];
  b[15] = a[15];

  b[16] = vaddq_s16(in[16], a[23]);
  b[17] = vaddq_s16(in[17], a[22]);
  b[18] = vaddq_s16(in[18], a[21]);
  b[19] = vaddq_s16(in[19], a[20]);

  b[20] = vsubq_s16(in[19], a[20]);
  b[21] = vsubq_s16(in[18], a[21]);
  b[22] = vsubq_s16(in[17], a[22]);
  b[23] = vsubq_s16(in[16], a[23]);

  b[24] = vsubq_s16(in[31], a[24]);
  b[25] = vsubq_s16(in[30], a[25]);
  b[26] = vsubq_s16(in[29], a[26]);
  b[27] = vsubq_s16(in[28], a[27]);

  b[28] = vaddq_s16(in[28], a[27]);
  b[29] = vaddq_s16(in[29], a[26]);
  b[30] = vaddq_s16(in[30], a[25]);
  b[31] = vaddq_s16(in[31], a[24]);

  // Stage 4.
  a[0] = vaddq_s16(b[0], b[3]);
  a[1] = vaddq_s16(b[1], b[2]);
  a[2] = vsubq_s16(b[1], b[2]);
  a[3] = vsubq_s16(b[0], b[3]);

  a[4] = b[4];

  butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);

  a[7] = b[7];

  a[8] = vaddq_s16(b[8], b[11]);
  a[9] = vaddq_s16(b[9], b[10]);
  a[10] = vsubq_s16(b[9], b[10]);
  a[11] = vsubq_s16(b[8], b[11]);
  a[12] = vsubq_s16(b[15], b[12]);
  a[13] = vsubq_s16(b[14], b[13]);
  a[14] = vaddq_s16(b[14], b[13]);
  a[15] = vaddq_s16(b[15], b[12]);

  a[16] = b[16];
  a[17] = b[17];

  butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
  butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
  butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
  butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);

  a[22] = b[22];
  a[23] = b[23];
  a[24] = b[24];
  a[25] = b[25];

  a[30] = b[30];
  a[31] = b[31];

  // Stage 5.
  butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
  butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);

  b[4] = vaddq_s16(a[4], a[5]);
  b[5] = vsubq_s16(a[4], a[5]);
  b[6] = vsubq_s16(a[7], a[6]);
  b[7] = vaddq_s16(a[7], a[6]);

  b[8] = a[8];

  butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
  butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);

  b[11] = a[11];
  b[12] = a[12];

  b[15] = a[15];

  b[16] = vaddq_s16(a[19], a[16]);
  b[17] = vaddq_s16(a[18], a[17]);
  b[18] = vsubq_s16(a[17], a[18]);
  b[19] = vsubq_s16(a[16], a[19]);
  b[20] = vsubq_s16(a[23], a[20]);
  b[21] = vsubq_s16(a[22], a[21]);
  b[22] = vaddq_s16(a[21], a[22]);
  b[23] = vaddq_s16(a[20], a[23]);
  b[24] = vaddq_s16(a[27], a[24]);
  b[25] = vaddq_s16(a[26], a[25]);
  b[26] = vsubq_s16(a[25], a[26]);
  b[27] = vsubq_s16(a[24], a[27]);
  b[28] = vsubq_s16(a[31], a[28]);
  b[29] = vsubq_s16(a[30], a[29]);
  b[30] = vaddq_s16(a[29], a[30]);
  b[31] = vaddq_s16(a[28], a[31]);

  // Stage 6.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];

  butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
  butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);

  a[8] = vaddq_s16(b[8], b[9]);
  a[9] = vsubq_s16(b[8], b[9]);
  a[10] = vsubq_s16(b[11], b[10]);
  a[11] = vaddq_s16(b[11], b[10]);
  a[12] = vaddq_s16(b[12], b[13]);
  a[13] = vsubq_s16(b[12], b[13]);
  a[14] = vsubq_s16(b[15], b[14]);
  a[15] = vaddq_s16(b[15], b[14]);

  a[16] = b[16];
  a[19] = b[19];
  a[20] = b[20];
  a[23] = b[23];
  a[24] = b[24];
  a[27] = b[27];
  a[28] = b[28];
  a[31] = b[31];

  butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
  butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);

  butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
  butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);

  // Stage 7.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];
  b[4] = a[4];
  b[5] = a[5];
  b[6] = a[6];
  b[7] = a[7];

  butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
  butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
  butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
  butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);

  b[16] = vaddq_s16(a[16], a[17]);
  b[17] = vsubq_s16(a[16], a[17]);
  b[18] = vsubq_s16(a[19], a[18]);
  b[19] = vaddq_s16(a[19], a[18]);
  b[20] = vaddq_s16(a[20], a[21]);
  b[21] = vsubq_s16(a[20], a[21]);
  b[22] = vsubq_s16(a[23], a[22]);
  b[23] = vaddq_s16(a[23], a[22]);
  b[24] = vaddq_s16(a[24], a[25]);
  b[25] = vsubq_s16(a[24], a[25]);
  b[26] = vsubq_s16(a[27], a[26]);
  b[27] = vaddq_s16(a[27], a[26]);
  b[28] = vaddq_s16(a[28], a[29]);
  b[29] = vsubq_s16(a[28], a[29]);
  b[30] = vsubq_s16(a[31], a[30]);
  b[31] = vaddq_s16(a[31], a[30]);

  // Final stage.
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  out[0] = sub_round_shift(b[0]);
  out[16] = sub_round_shift(b[1]);
  out[8] = sub_round_shift(b[2]);
  out[24] = sub_round_shift(b[3]);
  out[4] = sub_round_shift(b[4]);
  out[20] = sub_round_shift(b[5]);
  out[12] = sub_round_shift(b[6]);
  out[28] = sub_round_shift(b[7]);
  out[2] = sub_round_shift(b[8]);
  out[18] = sub_round_shift(b[9]);
  out[10] = sub_round_shift(b[10]);
  out[26] = sub_round_shift(b[11]);
  out[6] = sub_round_shift(b[12]);
  out[22] = sub_round_shift(b[13]);
  out[14] = sub_round_shift(b[14]);
  out[30] = sub_round_shift(b[15]);

  butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
  out[1] = sub_round_shift(a[1]);
  out[31] = sub_round_shift(a[31]);

  butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
  out[17] = sub_round_shift(a[17]);
  out[15] = sub_round_shift(a[15]);

  butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
  out[9] = sub_round_shift(a[9]);
  out[23] = sub_round_shift(a[23]);

  butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
  out[25] = sub_round_shift(a[25]);
  out[7] = sub_round_shift(a[7]);

  butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
  out[5] = sub_round_shift(a[5]);
  out[27] = sub_round_shift(a[27]);

  butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
  out[21] = sub_round_shift(a[21]);
  out[11] = sub_round_shift(a[11]);

  butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
  out[13] = sub_round_shift(a[13]);
  out[19] = sub_round_shift(a[19]);

  butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
  out[29] = sub_round_shift(a[29]);
  out[3] = sub_round_shift(a[3]);
}

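// The 32-bit second pass keeps each 8-wide row as a pair of int32x4_t
// registers: the *_lo arrays hold lanes 0-3 and the *_hi arrays lanes 4-7.
// The macros below paste that suffix onto the array names they are given.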
#define PASS_THROUGH(src, dst, element)    \
  do {                                     \
    dst##_lo[element] = src##_lo[element]; \
    dst##_hi[element] = src##_hi[element]; \
  } while (0)

#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
  do {                                                                       \
    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
  } while (0)

#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
  do {                                                                     \
    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
  } while (0)

#define ADD_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

#define SUB_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

// Like butterfly_one_coeff, but don't narrow results.
static INLINE void butterfly_one_coeff_s16_s32(
    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
    int32x4_t *sub_hi) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
                              add_index, sub_index)                      \
  do {                                                                   \
    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
                                &b##_lo[add_index], &b##_hi[add_index],  \
                                &b##_lo[sub_index], &b##_hi[sub_index]); \
  } while (0)

// Like butterfly_one_coeff, but with s32.
static INLINE void butterfly_one_coeff_s32(
    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
    const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
  const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
  const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
  const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
  const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
                          sub_index)                                          \
  do {                                                                        \
    butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index],           \
                            a##_lo[right_index], a##_hi[right_index],         \
                            constant, &b##_lo[add_index], &b##_hi[add_index], \
                            &b##_lo[sub_index], &b##_hi[sub_index]);          \
  } while (0)

// Like butterfly_two_coeff, but with s32.
static INLINE void butterfly_two_coeff_s32(
    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
    const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
    int32x4_t *sub_hi) {
  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
                          right_constant, b, add_index, sub_index)             \
  do {                                                                         \
    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
                            a##_lo[right_index], a##_hi[right_index],          \
                            left_constant, right_constant, &b##_lo[add_index], \
                            &b##_hi[add_index], &b##_lo[sub_index],            \
                            &b##_hi[sub_index]);                               \
  } while (0)

// Add 1 if positive, 2 if negative, and shift by 2.
// In practice, add 1, then add the sign bit, then shift without rounding.
static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
                                            const int32x4_t a_hi) {
  const int32x4_t one = vdupq_n_s32(1);
  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
  const int16x4_t b_lo =
      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
  const int16x4_t b_hi =
      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
  return vcombine_s16(b_lo, b_hi);
}

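// Second (row) pass for the full-precision transform. Stages 1 and 2 still
// fit in 16 bits; from stage 3 on the intermediates are widened to 32 bits
// (see the overflow note at stage 3) and narrowed again by the final
// rounding shift.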
static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];
  int32x4_t c_lo[32];
  int32x4_t c_hi[32];
  int32x4_t d_lo[32];
  int32x4_t d_hi[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  b[0] = vaddq_s16(a[0], a[15]);
  b[1] = vaddq_s16(a[1], a[14]);
  b[2] = vaddq_s16(a[2], a[13]);
  b[3] = vaddq_s16(a[3], a[12]);
  b[4] = vaddq_s16(a[4], a[11]);
  b[5] = vaddq_s16(a[5], a[10]);
  b[6] = vaddq_s16(a[6], a[9]);
  b[7] = vaddq_s16(a[7], a[8]);

  b[8] = vsubq_s16(a[7], a[8]);
  b[9] = vsubq_s16(a[6], a[9]);
  b[10] = vsubq_s16(a[5], a[10]);
  b[11] = vsubq_s16(a[4], a[11]);
  b[12] = vsubq_s16(a[3], a[12]);
  b[13] = vsubq_s16(a[2], a[13]);
  b[14] = vsubq_s16(a[1], a[14]);
  b[15] = vsubq_s16(a[0], a[15]);

  b[16] = a[16];
  b[17] = a[17];
  b[18] = a[18];
  b[19] = a[19];

  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);

  b[28] = a[28];
  b[29] = a[29];
  b[30] = a[30];
  b[31] = a[31];

  // Stage 3. With extreme input values this calculation overflows int16_t.
  // The sources for b[0] get added multiple times and, through testing, have
  // been shown to overflow starting here.
  ADD_S16_S32(b, 0, 7, c, 0);
  ADD_S16_S32(b, 1, 6, c, 1);
  ADD_S16_S32(b, 2, 5, c, 2);
  ADD_S16_S32(b, 3, 4, c, 3);
  SUB_S16_S32(b, 3, 4, c, 4);
  SUB_S16_S32(b, 2, 5, c, 5);
  SUB_S16_S32(b, 1, 6, c, 6);
  SUB_S16_S32(b, 0, 7, c, 7);

  a[8] = b[8];
  a[9] = b[9];

  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);

  a[14] = b[14];
  a[15] = b[15];

  ADD_S16_S32(b, 16, 23, c, 16);
  ADD_S16_S32(b, 17, 22, c, 17);
  ADD_S16_S32(b, 18, 21, c, 18);
  ADD_S16_S32(b, 19, 20, c, 19);
  SUB_S16_S32(b, 19, 20, c, 20);
  SUB_S16_S32(b, 18, 21, c, 21);
  SUB_S16_S32(b, 17, 22, c, 22);
  SUB_S16_S32(b, 16, 23, c, 23);
  SUB_S16_S32(b, 31, 24, c, 24);
  SUB_S16_S32(b, 30, 25, c, 25);
  SUB_S16_S32(b, 29, 26, c, 26);
  SUB_S16_S32(b, 28, 27, c, 27);
  ADD_S16_S32(b, 28, 27, c, 28);
  ADD_S16_S32(b, 29, 26, c, 29);
  ADD_S16_S32(b, 30, 25, c, 30);
  ADD_S16_S32(b, 31, 24, c, 31);

  // Stage 4.
  ADD_S32(c, 0, 3, d, 0);
  ADD_S32(c, 1, 2, d, 1);
  SUB_S32(c, 1, 2, d, 2);
  SUB_S32(c, 0, 3, d, 3);

  PASS_THROUGH(c, d, 4);

  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);

  PASS_THROUGH(c, d, 7);

  ADDW_S16_S32(c, 11, a, 8, d, 8);
  ADDW_S16_S32(c, 10, a, 9, d, 9);
  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
  ADDW_S16_S32(c, 13, b, 14, d, 14);
  ADDW_S16_S32(c, 12, b, 15, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 17);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
  BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
  BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);

  PASS_THROUGH(c, d, 22);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 25);

  PASS_THROUGH(c, d, 30);
  PASS_THROUGH(c, d, 31);

  // Stage 5.
  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
  BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);

  ADD_S32(d, 4, 5, c, 4);
  SUB_S32(d, 4, 5, c, 5);
  SUB_S32(d, 7, 6, c, 6);
  ADD_S32(d, 7, 6, c, 7);

  PASS_THROUGH(d, c, 8);

  BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
  BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);

  PASS_THROUGH(d, c, 11);
  PASS_THROUGH(d, c, 12);
  PASS_THROUGH(d, c, 15);

  ADD_S32(d, 16, 19, c, 16);
  ADD_S32(d, 17, 18, c, 17);
  SUB_S32(d, 17, 18, c, 18);
  SUB_S32(d, 16, 19, c, 19);
  SUB_S32(d, 23, 20, c, 20);
  SUB_S32(d, 22, 21, c, 21);
  ADD_S32(d, 22, 21, c, 22);
  ADD_S32(d, 23, 20, c, 23);
  ADD_S32(d, 24, 27, c, 24);
  ADD_S32(d, 25, 26, c, 25);
  SUB_S32(d, 25, 26, c, 26);
  SUB_S32(d, 24, 27, c, 27);
  SUB_S32(d, 31, 28, c, 28);
  SUB_S32(d, 30, 29, c, 29);
  ADD_S32(d, 30, 29, c, 30);
  ADD_S32(d, 31, 28, c, 31);

  // Stage 6.
  PASS_THROUGH(c, d, 0);
  PASS_THROUGH(c, d, 1);
  PASS_THROUGH(c, d, 2);
  PASS_THROUGH(c, d, 3);

  BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
  BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);

  ADD_S32(c, 8, 9, d, 8);
  SUB_S32(c, 8, 9, d, 9);
  SUB_S32(c, 11, 10, d, 10);
  ADD_S32(c, 11, 10, d, 11);
  ADD_S32(c, 12, 13, d, 12);
  SUB_S32(c, 12, 13, d, 13);
  SUB_S32(c, 15, 14, d, 14);
  ADD_S32(c, 15, 14, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 19);
  PASS_THROUGH(c, d, 20);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 27);
  PASS_THROUGH(c, d, 28);
  PASS_THROUGH(c, d, 31);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
  BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
  BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);

  // Stage 7.
  PASS_THROUGH(d, c, 0);
  PASS_THROUGH(d, c, 1);
  PASS_THROUGH(d, c, 2);
  PASS_THROUGH(d, c, 3);
  PASS_THROUGH(d, c, 4);
  PASS_THROUGH(d, c, 5);
  PASS_THROUGH(d, c, 6);
  PASS_THROUGH(d, c, 7);

  BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
  BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
  BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
  BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);

  ADD_S32(d, 16, 17, c, 16);
  SUB_S32(d, 16, 17, c, 17);
  SUB_S32(d, 19, 18, c, 18);
  ADD_S32(d, 19, 18, c, 19);
  ADD_S32(d, 20, 21, c, 20);
  SUB_S32(d, 20, 21, c, 21);
  SUB_S32(d, 23, 22, c, 22);
  ADD_S32(d, 23, 22, c, 23);
  ADD_S32(d, 24, 25, c, 24);
  SUB_S32(d, 24, 25, c, 25);
  SUB_S32(d, 27, 26, c, 26);
  ADD_S32(d, 27, 26, c, 27);
  ADD_S32(d, 28, 29, c, 28);
  SUB_S32(d, 28, 29, c, 29);
  SUB_S32(d, 31, 30, c, 30);
  ADD_S32(d, 31, 30, c, 31);

  // Final stage.
  // Roll rounding into this function so we can pass back int16x8.

  out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
  out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);

  out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
  out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
  out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
  out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
  out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);

  out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
  out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
  out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
  out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);

  out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
  out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
  out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
  out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
  out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);

  BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
  out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
  out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
  out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
  out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
  out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
  out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);

  BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
  out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
  out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);

  BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
  out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
  out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);

  BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
  out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
  out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);

  BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
  out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
  out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);

  BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
  out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
  out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
}

// Add 1 if positive, 2 if negative, and shift by 2.
// In practice, add 1, then add the sign bit, then shift without rounding.
static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
  const int16x8_t one = vdupq_n_s16(1);
  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
  return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
}

static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  // For the "rd" version, all the values are rounded down after stage 2 to
  // keep the values in 16 bits.
  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));

  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));

  b[16] = add_round_shift_s16(a[16]);
  b[17] = add_round_shift_s16(a[17]);
  b[18] = add_round_shift_s16(a[18]);
  b[19] = add_round_shift_s16(a[19]);

  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
  b[20] = add_round_shift_s16(b[20]);
  b[21] = add_round_shift_s16(b[21]);
  b[22] = add_round_shift_s16(b[22]);
  b[23] = add_round_shift_s16(b[23]);
  b[24] = add_round_shift_s16(b[24]);
  b[25] = add_round_shift_s16(b[25]);
  b[26] = add_round_shift_s16(b[26]);
  b[27] = add_round_shift_s16(b[27]);

  b[28] = add_round_shift_s16(a[28]);
  b[29] = add_round_shift_s16(a[29]);
  b[30] = add_round_shift_s16(a[30]);
  b[31] = add_round_shift_s16(a[31]);

  // Stage 3.
  a[0] = vaddq_s16(b[0], b[7]);
  a[1] = vaddq_s16(b[1], b[6]);
  a[2] = vaddq_s16(b[2], b[5]);
  a[3] = vaddq_s16(b[3], b[4]);

  a[4] = vsubq_s16(b[3], b[4]);
  a[5] = vsubq_s16(b[2], b[5]);
  a[6] = vsubq_s16(b[1], b[6]);
  a[7] = vsubq_s16(b[0], b[7]);

  a[8] = b[8];
  a[9] = b[9];

  butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
  butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);

  a[14] = b[14];
  a[15] = b[15];

  a[16] = vaddq_s16(b[16], b[23]);
  a[17] = vaddq_s16(b[17], b[22]);
  a[18] = vaddq_s16(b[18], b[21]);
  a[19] = vaddq_s16(b[19], b[20]);

  a[20] = vsubq_s16(b[19], b[20]);
  a[21] = vsubq_s16(b[18], b[21]);
  a[22] = vsubq_s16(b[17], b[22]);
  a[23] = vsubq_s16(b[16], b[23]);

  a[24] = vsubq_s16(b[31], b[24]);
  a[25] = vsubq_s16(b[30], b[25]);
  a[26] = vsubq_s16(b[29], b[26]);
  a[27] = vsubq_s16(b[28], b[27]);

  a[28] = vaddq_s16(b[28], b[27]);
  a[29] = vaddq_s16(b[29], b[26]);
  a[30] = vaddq_s16(b[30], b[25]);
  a[31] = vaddq_s16(b[31], b[24]);

  // Stage 4.
  b[0] = vaddq_s16(a[0], a[3]);
  b[1] = vaddq_s16(a[1], a[2]);
  b[2] = vsubq_s16(a[1], a[2]);
  b[3] = vsubq_s16(a[0], a[3]);

  b[4] = a[4];

  butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);

  b[7] = a[7];

  b[8] = vaddq_s16(a[8], a[11]);
  b[9] = vaddq_s16(a[9], a[10]);
  b[10] = vsubq_s16(a[9], a[10]);
  b[11] = vsubq_s16(a[8], a[11]);
  b[12] = vsubq_s16(a[15], a[12]);
  b[13] = vsubq_s16(a[14], a[13]);
  b[14] = vaddq_s16(a[14], a[13]);
  b[15] = vaddq_s16(a[15], a[12]);

  b[16] = a[16];
  b[17] = a[17];

  butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
  butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
  butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
  butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);

  b[22] = a[22];
  b[23] = a[23];
  b[24] = a[24];
  b[25] = a[25];

  b[30] = a[30];
  b[31] = a[31];

  // Stage 5.
  butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
  butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);

  a[4] = vaddq_s16(b[4], b[5]);
  a[5] = vsubq_s16(b[4], b[5]);
  a[6] = vsubq_s16(b[7], b[6]);
  a[7] = vaddq_s16(b[7], b[6]);

  a[8] = b[8];

  butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
  butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);

  a[11] = b[11];
  a[12] = b[12];

  a[15] = b[15];

  a[16] = vaddq_s16(b[19], b[16]);
  a[17] = vaddq_s16(b[18], b[17]);
  a[18] = vsubq_s16(b[17], b[18]);
  a[19] = vsubq_s16(b[16], b[19]);
  a[20] = vsubq_s16(b[23], b[20]);
  a[21] = vsubq_s16(b[22], b[21]);
  a[22] = vaddq_s16(b[21], b[22]);
  a[23] = vaddq_s16(b[20], b[23]);
  a[24] = vaddq_s16(b[27], b[24]);
  a[25] = vaddq_s16(b[26], b[25]);
  a[26] = vsubq_s16(b[25], b[26]);
  a[27] = vsubq_s16(b[24], b[27]);
  a[28] = vsubq_s16(b[31], b[28]);
  a[29] = vsubq_s16(b[30], b[29]);
  a[30] = vaddq_s16(b[29], b[30]);
  a[31] = vaddq_s16(b[28], b[31]);

  // Stage 6.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];

  butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
  butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);

  b[8] = vaddq_s16(a[8], a[9]);
  b[9] = vsubq_s16(a[8], a[9]);
  b[10] = vsubq_s16(a[11], a[10]);
  b[11] = vaddq_s16(a[11], a[10]);
  b[12] = vaddq_s16(a[12], a[13]);
  b[13] = vsubq_s16(a[12], a[13]);
  b[14] = vsubq_s16(a[15], a[14]);
  b[15] = vaddq_s16(a[15], a[14]);

  b[16] = a[16];
  b[19] = a[19];
  b[20] = a[20];
  b[23] = a[23];
  b[24] = a[24];
  b[27] = a[27];
  b[28] = a[28];
  b[31] = a[31];

  butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
  butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);

  butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
  butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);

  // Stage 7.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  a[4] = b[4];
  a[5] = b[5];
  a[6] = b[6];
  a[7] = b[7];

  butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
  butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
  butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
  butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);

  a[16] = vaddq_s16(b[16], b[17]);
  a[17] = vsubq_s16(b[16], b[17]);
  a[18] = vsubq_s16(b[19], b[18]);
  a[19] = vaddq_s16(b[19], b[18]);
  a[20] = vaddq_s16(b[20], b[21]);
  a[21] = vsubq_s16(b[20], b[21]);
  a[22] = vsubq_s16(b[23], b[22]);
  a[23] = vaddq_s16(b[23], b[22]);
  a[24] = vaddq_s16(b[24], b[25]);
  a[25] = vsubq_s16(b[24], b[25]);
  a[26] = vsubq_s16(b[27], b[26]);
  a[27] = vaddq_s16(b[27], b[26]);
  a[28] = vaddq_s16(b[28], b[29]);
  a[29] = vsubq_s16(b[28], b[29]);
  a[30] = vsubq_s16(b[31], b[30]);
  a[31] = vaddq_s16(b[31], b[30]);

  // Final stage.
  out[0] = a[0];
  out[16] = a[1];
  out[8] = a[2];
  out[24] = a[3];
  out[4] = a[4];
  out[20] = a[5];
  out[12] = a[6];
  out[28] = a[7];
  out[2] = a[8];
  out[18] = a[9];
  out[10] = a[10];
  out[26] = a[11];
  out[6] = a[12];
  out[22] = a[13];
  out[14] = a[14];
  out[30] = a[15];

  butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
  butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
                      &out[15]);
  butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
  butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
  butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
  butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
                      &out[11]);
  butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
                      &out[19]);
  butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
}

#undef PASS_THROUGH
#undef ADD_S16_S32
#undef SUB_S16_S32
#undef ADDW_S16_S32
#undef SUBW_S16_S32
#undef ADD_S32
#undef SUB_S32
#undef BUTTERFLY_ONE_S16_S32
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32

// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
// are all in-place.
// TODO(johannkoenig): share with other fdcts.
static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
  // Swap 16 bit elements.
  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements.
  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));
  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
                                   vreinterpretq_s32_s16(c3.val[0]));
  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
                                   vreinterpretq_s32_s16(c3.val[1]));

  // Swap 64 bit elements
  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);

  b[0] = e0.val[0];
  b[1] = e1.val[0];
  b[2] = e2.val[0];
  b[3] = e3.val[0];
  b[4] = e0.val[1];
  b[5] = e1.val[1];
  b[6] = e2.val[1];
  b[7] = e3.val[1];
}

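// Full 32x32 forward transform. The first pass runs the column DCT on four
// 8x32 strips, the 8x8 tiles are then transposed into row order, and the
// second pass runs the widened (32-bit) row DCT before transposing back and
// storing one 8x32 strip of coefficients at a time.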
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  int16x8_t temp0[32];
  int16x8_t temp1[32];
  int16x8_t temp2[32];
  int16x8_t temp3[32];
  int16x8_t temp4[32];
  int16x8_t temp5[32];

  // Process in 8x32 columns.
  load(input, stride, temp0);
  dct_body_first_pass(temp0, temp1);

  load(input + 8, stride, temp0);
  dct_body_first_pass(temp0, temp2);

  load(input + 16, stride, temp0);
  dct_body_first_pass(temp0, temp3);

  load(input + 24, stride, temp0);
  dct_body_first_pass(temp0, temp4);

  // Generate the top row by munging the first set of 8 from each one together.
  transpose_8x8(&temp1[0], &temp0[0]);
  transpose_8x8(&temp2[0], &temp0[8]);
  transpose_8x8(&temp3[0], &temp0[16]);
  transpose_8x8(&temp4[0], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output, temp5);

  // Second row of 8x32.
  transpose_8x8(&temp1[8], &temp0[0]);
  transpose_8x8(&temp2[8], &temp0[8]);
  transpose_8x8(&temp3[8], &temp0[16]);
  transpose_8x8(&temp4[8], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 8 * 32, temp5);

  // Third row of 8x32
  transpose_8x8(&temp1[16], &temp0[0]);
  transpose_8x8(&temp2[16], &temp0[8]);
  transpose_8x8(&temp3[16], &temp0[16]);
  transpose_8x8(&temp4[16], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 16 * 32, temp5);

  // Final row of 8x32.
  transpose_8x8(&temp1[24], &temp0[0]);
  transpose_8x8(&temp2[24], &temp0[8]);
  transpose_8x8(&temp3[24], &temp0[16]);
  transpose_8x8(&temp4[24], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 24 * 32, temp5);
}

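// Same structure as vpx_fdct32x32_neon, but the second pass uses
// dct_body_second_pass_rd, which rounds after stage 2 so the row transform
// stays within 16 bits.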
void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  int16x8_t temp0[32];
  int16x8_t temp1[32];
  int16x8_t temp2[32];
  int16x8_t temp3[32];
  int16x8_t temp4[32];
  int16x8_t temp5[32];

  // Process in 8x32 columns.
  load(input, stride, temp0);
  dct_body_first_pass(temp0, temp1);

  load(input + 8, stride, temp0);
  dct_body_first_pass(temp0, temp2);

  load(input + 16, stride, temp0);
  dct_body_first_pass(temp0, temp3);

  load(input + 24, stride, temp0);
  dct_body_first_pass(temp0, temp4);

  // Generate the top row by munging the first set of 8 from each one together.
  transpose_8x8(&temp1[0], &temp0[0]);
  transpose_8x8(&temp2[0], &temp0[8]);
  transpose_8x8(&temp3[0], &temp0[16]);
  transpose_8x8(&temp4[0], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output, temp5);

  // Second row of 8x32.
  transpose_8x8(&temp1[8], &temp0[0]);
  transpose_8x8(&temp2[8], &temp0[8]);
  transpose_8x8(&temp3[8], &temp0[16]);
  transpose_8x8(&temp4[8], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 8 * 32, temp5);

  // Third row of 8x32
  transpose_8x8(&temp1[16], &temp0[0]);
  transpose_8x8(&temp2[16], &temp0[8]);
  transpose_8x8(&temp3[16], &temp0[16]);
  transpose_8x8(&temp4[16], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 16 * 32, temp5);

  // Final row of 8x32.
  transpose_8x8(&temp1[24], &temp0[0]);
  transpose_8x8(&temp2[24], &temp0[8]);
  transpose_8x8(&temp3[24], &temp0[16]);
  transpose_8x8(&temp4[24], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 24 * 32, temp5);
}
#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
        // __GNUC__ == 4 && __GNUC_MINOR__ <= 9