// idct16x16_add_neon.c, revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"

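// Transpose an 8x8 block of int16 values held in q8-q15. The three steps
// mirror the assembly idiom: swap high/low halves across register pairs
// (the vswp comments below), then interleave 32-bit pairs with vtrnq_s32,
// then 16-bit pairs with vtrnq_s16.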
static INLINE void TRANSPOSE8X8(
        int16x8_t *q8s16,
        int16x8_t *q9s16,
        int16x8_t *q10s16,
        int16x8_t *q11s16,
        int16x8_t *q12s16,
        int16x8_t *q13s16,
        int16x8_t *q14s16,
        int16x8_t *q15s16) {
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

    d16s16 = vget_low_s16(*q8s16);
    d17s16 = vget_high_s16(*q8s16);
    d18s16 = vget_low_s16(*q9s16);
    d19s16 = vget_high_s16(*q9s16);
    d20s16 = vget_low_s16(*q10s16);
    d21s16 = vget_high_s16(*q10s16);
    d22s16 = vget_low_s16(*q11s16);
    d23s16 = vget_high_s16(*q11s16);
    d24s16 = vget_low_s16(*q12s16);
    d25s16 = vget_high_s16(*q12s16);
    d26s16 = vget_low_s16(*q13s16);
    d27s16 = vget_high_s16(*q13s16);
    d28s16 = vget_low_s16(*q14s16);
    d29s16 = vget_high_s16(*q14s16);
    d30s16 = vget_low_s16(*q15s16);
    d31s16 = vget_high_s16(*q15s16);

    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    *q12s16 = vcombine_s16(d17s16, d25s16);
    *q13s16 = vcombine_s16(d19s16, d27s16);
    *q14s16 = vcombine_s16(d21s16, d29s16);
    *q15s16 = vcombine_s16(d23s16, d31s16);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                        vreinterpretq_s32_s16(*q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                        vreinterpretq_s32_s16(*q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                        vreinterpretq_s32_s16(*q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                        vreinterpretq_s32_s16(*q15s16));

    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    *q8s16  = q0x2s16.val[0];
    *q9s16  = q0x2s16.val[1];
    *q10s16 = q1x2s16.val[0];
    *q11s16 = q1x2s16.val[1];
    *q12s16 = q2x2s16.val[0];
    *q13s16 = q2x2s16.val[1];
    *q14s16 = q3x2s16.val[0];
    *q15s16 = q3x2s16.val[1];
    return;
}

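// First pass of the 16x16 IDCT: the vld2q_s16 loads deinterleave each
// 16-coefficient row and keep val[0] (the even-indexed coefficients), so
// after the transpose this pass evaluates the even half of the 16-point
// transform, which reduces to an 8-point IDCT. The intermediate results
// are written to out for the second pass.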
void vpx_idct16x16_256_add_neon_pass1(
        int16_t *in,
        int16_t *out,
        int output_stride) {
    int16x4_t d0s16, d1s16, d2s16, d3s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
    int16x8x2_t q0x2s16;

    q0x2s16 = vld2q_s16(in);
    q8s16  = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q9s16  = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q10s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q11s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q12s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q13s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q14s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

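    // Each butterfly below accumulates widening 16x16->32-bit multiplies by
    // the cospi constants, then vqrshrn_n_s32(x, 14) applies the rounded
    // DCT_CONST_BITS shift to narrow back to 16 bits.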
    // stage 3
    d0s16 = vdup_n_s16(cospi_28_64);
    d1s16 = vdup_n_s16(cospi_4_64);

    q2s32 = vmull_s16(d18s16, d0s16);
    q3s32 = vmull_s16(d19s16, d0s16);
    q5s32 = vmull_s16(d18s16, d1s16);
    q6s32 = vmull_s16(d19s16, d1s16);

    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
    q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
    q6s32 = vmlal_s16(q6s32, d31s16, d0s16);

    d2s16 = vdup_n_s16(cospi_12_64);
    d3s16 = vdup_n_s16(cospi_20_64);

    d8s16 = vqrshrn_n_s32(q2s32, 14);
    d9s16 = vqrshrn_n_s32(q3s32, 14);
    d14s16 = vqrshrn_n_s32(q5s32, 14);
    d15s16 = vqrshrn_n_s32(q6s32, 14);
    q4s16 = vcombine_s16(d8s16, d9s16);
    q7s16 = vcombine_s16(d14s16, d15s16);

    q2s32 = vmull_s16(d26s16, d2s16);
    q3s32 = vmull_s16(d27s16, d2s16);
    q9s32 = vmull_s16(d26s16, d3s16);
    q15s32 = vmull_s16(d27s16, d3s16);

    q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
    q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
    q15s32 = vmlal_s16(q15s32, d23s16, d2s16);

    d10s16 = vqrshrn_n_s32(q2s32, 14);
    d11s16 = vqrshrn_n_s32(q3s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q15s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 4
    d30s16 = vdup_n_s16(cospi_16_64);

    q2s32 = vmull_s16(d16s16, d30s16);
    q11s32 = vmull_s16(d17s16, d30s16);
    q0s32 = vmull_s16(d24s16, d30s16);
    q1s32 = vmull_s16(d25s16, d30s16);

    d30s16 = vdup_n_s16(cospi_24_64);
    d31s16 = vdup_n_s16(cospi_8_64);

    q3s32 = vaddq_s32(q2s32, q0s32);
    q12s32 = vaddq_s32(q11s32, q1s32);
    q13s32 = vsubq_s32(q2s32, q0s32);
    q1s32 = vsubq_s32(q11s32, q1s32);

    d16s16 = vqrshrn_n_s32(q3s32, 14);
    d17s16 = vqrshrn_n_s32(q12s32, 14);
    d18s16 = vqrshrn_n_s32(q13s32, 14);
    d19s16 = vqrshrn_n_s32(q1s32, 14);
    q8s16 = vcombine_s16(d16s16, d17s16);
    q9s16 = vcombine_s16(d18s16, d19s16);

    q0s32 = vmull_s16(d20s16, d31s16);
    q1s32 = vmull_s16(d21s16, d31s16);
    q12s32 = vmull_s16(d20s16, d30s16);
    q13s32 = vmull_s16(d21s16, d30s16);

    q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
    q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
    q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
    q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);

    d22s16 = vqrshrn_n_s32(q0s32, 14);
    d23s16 = vqrshrn_n_s32(q1s32, 14);
    d20s16 = vqrshrn_n_s32(q12s32, 14);
    d21s16 = vqrshrn_n_s32(q13s32, 14);
    q10s16 = vcombine_s16(d20s16, d21s16);
    q11s16 = vcombine_s16(d22s16, d23s16);

    q13s16 = vsubq_s16(q4s16, q5s16);
    q4s16 = vaddq_s16(q4s16, q5s16);
    q14s16 = vsubq_s16(q7s16, q6s16);
    q15s16 = vaddq_s16(q6s16, q7s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    // stage 5
    q0s16 = vaddq_s16(q8s16, q11s16);
    q1s16 = vaddq_s16(q9s16, q10s16);
    q2s16 = vsubq_s16(q9s16, q10s16);
    q3s16 = vsubq_s16(q8s16, q11s16);

    d16s16 = vdup_n_s16(cospi_16_64);

    q11s32 = vmull_s16(d26s16, d16s16);
    q12s32 = vmull_s16(d27s16, d16s16);
    q9s32 = vmull_s16(d28s16, d16s16);
    q10s32 = vmull_s16(d29s16, d16s16);

    q6s32 = vsubq_s32(q9s32, q11s32);
    q13s32 = vsubq_s32(q10s32, q12s32);
    q9s32 = vaddq_s32(q9s32, q11s32);
    q10s32 = vaddq_s32(q10s32, q12s32);

    d10s16 = vqrshrn_n_s32(q6s32, 14);
    d11s16 = vqrshrn_n_s32(q13s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q10s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 6
    q8s16 = vaddq_s16(q0s16, q15s16);
    q9s16 = vaddq_s16(q1s16, q6s16);
    q10s16 = vaddq_s16(q2s16, q5s16);
    q11s16 = vaddq_s16(q3s16, q4s16);
    q12s16 = vsubq_s16(q3s16, q4s16);
    q13s16 = vsubq_s16(q2s16, q5s16);
    q14s16 = vsubq_s16(q1s16, q6s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

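    // Each vst1_u64 below stores four int16 coefficients; output_stride is
    // supplied in bytes, so it is halved to step out in int16_t units.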
    // store the data
    output_stride >>= 1;  // output_stride / 2, out is int16_t
    vst1_u64((uint64_t *)out, d16u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d17u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d18u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d19u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d20u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d21u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d22u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d23u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d24u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d25u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d26u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d27u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d28u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d29u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d30u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d31u64);
    return;
}

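// Second pass of the 16x16 IDCT. The caller is expected to point src so
// that the deinterleaving loads pick up the odd-indexed coefficients; the
// butterflies below use the odd-half twiddle factors (cospi_30_64,
// cospi_2_64, ...). Stage 7 combines these results with the even-half
// values read back from pass1Output. When skip_adding is nonzero, the
// sums are rounded and accumulated onto dest; otherwise they are written
// back to out.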
void vpx_idct16x16_256_add_neon_pass2(
        int16_t *src,
        int16_t *out,
        int16_t *pass1Output,
        int16_t skip_adding,
        uint8_t *dest,
        int dest_stride) {
    uint8_t *d;
    uint8x8_t d12u8, d13u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    uint64x1_t d24u64, d25u64, d26u64, d27u64;
    int64x1_t d12s64, d13s64;
    uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
    uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32;
    int16x8x2_t q0x2s16;

    q0x2s16 = vld2q_s16(src);
    q8s16  = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q9s16  = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q10s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q11s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q12s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q13s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q14s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

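    // The four butterfly groups below apply the odd-half twiddle factors
    // (cospi_30_64/cospi_2_64, cospi_14_64/cospi_18_64,
    // cospi_22_64/cospi_10_64, cospi_6_64/cospi_26_64) to the transposed
    // rows.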
    // stage 3
    d12s16 = vdup_n_s16(cospi_30_64);
    d13s16 = vdup_n_s16(cospi_2_64);

    q2s32 = vmull_s16(d16s16, d12s16);
    q3s32 = vmull_s16(d17s16, d12s16);
    q1s32 = vmull_s16(d16s16, d13s16);
    q4s32 = vmull_s16(d17s16, d13s16);

    q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
    q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
    q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
    q4s32 = vmlal_s16(q4s32, d31s16, d12s16);

    d0s16 = vqrshrn_n_s32(q2s32, 14);
    d1s16 = vqrshrn_n_s32(q3s32, 14);
    d14s16 = vqrshrn_n_s32(q1s32, 14);
    d15s16 = vqrshrn_n_s32(q4s32, 14);
    q0s16 = vcombine_s16(d0s16, d1s16);
    q7s16 = vcombine_s16(d14s16, d15s16);

    d30s16 = vdup_n_s16(cospi_14_64);
    d31s16 = vdup_n_s16(cospi_18_64);

    q2s32 = vmull_s16(d24s16, d30s16);
    q3s32 = vmull_s16(d25s16, d30s16);
    q4s32 = vmull_s16(d24s16, d31s16);
    q5s32 = vmull_s16(d25s16, d31s16);

    q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
    q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
    q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
    q5s32 = vmlal_s16(q5s32, d23s16, d30s16);

    d2s16 = vqrshrn_n_s32(q2s32, 14);
    d3s16 = vqrshrn_n_s32(q3s32, 14);
    d12s16 = vqrshrn_n_s32(q4s32, 14);
    d13s16 = vqrshrn_n_s32(q5s32, 14);
    q1s16 = vcombine_s16(d2s16, d3s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    d30s16 = vdup_n_s16(cospi_22_64);
    d31s16 = vdup_n_s16(cospi_10_64);

    q11s32 = vmull_s16(d20s16, d30s16);
    q12s32 = vmull_s16(d21s16, d30s16);
    q4s32 = vmull_s16(d20s16, d31s16);
    q5s32 = vmull_s16(d21s16, d31s16);

    q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
    q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
    q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
    q5s32 = vmlal_s16(q5s32, d27s16, d30s16);

    d4s16 = vqrshrn_n_s32(q11s32, 14);
    d5s16 = vqrshrn_n_s32(q12s32, 14);
    d11s16 = vqrshrn_n_s32(q5s32, 14);
    d10s16 = vqrshrn_n_s32(q4s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    d30s16 = vdup_n_s16(cospi_6_64);
    d31s16 = vdup_n_s16(cospi_26_64);

    q10s32 = vmull_s16(d28s16, d30s16);
    q11s32 = vmull_s16(d29s16, d30s16);
    q12s32 = vmull_s16(d28s16, d31s16);
    q13s32 = vmull_s16(d29s16, d31s16);

    q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
    q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
    q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
    q13s32 = vmlal_s16(q13s32, d19s16, d30s16);

    d6s16 = vqrshrn_n_s32(q10s32, 14);
    d7s16 = vqrshrn_n_s32(q11s32, 14);
    d8s16 = vqrshrn_n_s32(q12s32, 14);
    d9s16 = vqrshrn_n_s32(q13s32, 14);
    q3s16 = vcombine_s16(d6s16, d7s16);
    q4s16 = vcombine_s16(d8s16, d9s16);

    // stage 3
    q9s16  = vsubq_s16(q0s16, q1s16);
    q0s16  = vaddq_s16(q0s16, q1s16);
    q10s16 = vsubq_s16(q3s16, q2s16);
    q11s16 = vaddq_s16(q2s16, q3s16);
    q12s16 = vaddq_s16(q4s16, q5s16);
    q13s16 = vsubq_s16(q4s16, q5s16);
    q14s16 = vsubq_s16(q7s16, q6s16);
    q7s16  = vaddq_s16(q6s16, q7s16);

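    // Stage 4 rotates the intermediate pairs by the cospi_8_64/cospi_24_64
    // angle; the vdup_n_s16(-cospi_8_64) below supplies the negated
    // constant for the second butterfly.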
    // stage 4
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    d30s16 = vdup_n_s16(cospi_8_64);
    d31s16 = vdup_n_s16(cospi_24_64);

    q2s32 = vmull_s16(d18s16, d31s16);
    q3s32 = vmull_s16(d19s16, d31s16);
    q4s32 = vmull_s16(d28s16, d31s16);
    q5s32 = vmull_s16(d29s16, d31s16);

    q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
    q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
    q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
    q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);

    d12s16 = vqrshrn_n_s32(q2s32, 14);
    d13s16 = vqrshrn_n_s32(q3s32, 14);
    d2s16 = vqrshrn_n_s32(q4s32, 14);
    d3s16 = vqrshrn_n_s32(q5s32, 14);
    q1s16 = vcombine_s16(d2s16, d3s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    q3s16 = q11s16;
    q4s16 = q12s16;

    d30s16 = vdup_n_s16(-cospi_8_64);
    q11s32 = vmull_s16(d26s16, d30s16);
    q12s32 = vmull_s16(d27s16, d30s16);
    q8s32 = vmull_s16(d20s16, d30s16);
    q9s32 = vmull_s16(d21s16, d30s16);

    q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
    q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
    q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
    q9s32 = vmlal_s16(q9s32, d27s16, d31s16);

    d4s16 = vqrshrn_n_s32(q11s32, 14);
    d5s16 = vqrshrn_n_s32(q12s32, 14);
    d10s16 = vqrshrn_n_s32(q8s32, 14);
    d11s16 = vqrshrn_n_s32(q9s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    // stage 5
    q8s16  = vaddq_s16(q0s16, q3s16);
    q9s16  = vaddq_s16(q1s16, q2s16);
    q10s16 = vsubq_s16(q1s16, q2s16);
    q11s16 = vsubq_s16(q0s16, q3s16);
    q12s16 = vsubq_s16(q7s16, q4s16);
    q13s16 = vsubq_s16(q6s16, q5s16);
    q14s16 = vaddq_s16(q6s16, q5s16);
    q15s16 = vaddq_s16(q7s16, q4s16);

    // stage 6
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);

    d14s16 = vdup_n_s16(cospi_16_64);

    q3s32 = vmull_s16(d26s16, d14s16);
    q4s32 = vmull_s16(d27s16, d14s16);
    q0s32 = vmull_s16(d20s16, d14s16);
    q1s32 = vmull_s16(d21s16, d14s16);

    q5s32 = vsubq_s32(q3s32, q0s32);
    q6s32 = vsubq_s32(q4s32, q1s32);
    q10s32 = vaddq_s32(q3s32, q0s32);
    q4s32 = vaddq_s32(q4s32, q1s32);

    d4s16 = vqrshrn_n_s32(q5s32, 14);
    d5s16 = vqrshrn_n_s32(q6s32, 14);
    d10s16 = vqrshrn_n_s32(q10s32, 14);
    d11s16 = vqrshrn_n_s32(q4s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    q0s32 = vmull_s16(d22s16, d14s16);
    q1s32 = vmull_s16(d23s16, d14s16);
    q13s32 = vmull_s16(d24s16, d14s16);
    q6s32 = vmull_s16(d25s16, d14s16);

    q10s32 = vsubq_s32(q13s32, q0s32);
    q4s32 = vsubq_s32(q6s32, q1s32);
    q13s32 = vaddq_s32(q13s32, q0s32);
    q6s32 = vaddq_s32(q6s32, q1s32);

    d6s16 = vqrshrn_n_s32(q10s32, 14);
    d7s16 = vqrshrn_n_s32(q4s32, 14);
    d8s16 = vqrshrn_n_s32(q13s32, 14);
    d9s16 = vqrshrn_n_s32(q6s32, 14);
    q3s16 = vcombine_s16(d6s16, d7s16);
    q4s16 = vcombine_s16(d8s16, d9s16);

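    // Stage 7 reloads the even-half rows saved by pass1 from pass1Output
    // and forms the final sums and differences. In the adding path each
    // result row is rounded down by 6 bits (the final IDCT shift), widened,
    // added to the destination pixels and saturated back to 8 bits. In the
    // write-back path each 8-wide row is stored as two 64-bit halves into
    // the 16-wide out buffer; the out += 4 / out += 12 steps advance to the
    // next row.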
    // stage 7
    if (skip_adding != 0) {
        d = dest;
        // load the data in pass1
        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;

        q12s16 = vaddq_s16(q0s16, q15s16);
        q13s16 = vaddq_s16(q1s16, q14s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q14s16 = vsubq_s16(q1s16, q14s16);
        q15s16 = vsubq_s16(q0s16, q15s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q12s16 = vaddq_s16(q10s16, q5s16);
        q13s16 = vaddq_s16(q11s16, q4s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q4s16 = vsubq_s16(q11s16, q4s16);
        q5s16 = vsubq_s16(q10s16, q5s16);

        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q12s16 = vaddq_s16(q0s16, q3s16);
        q13s16 = vaddq_s16(q1s16, q2s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q2s16 = vsubq_s16(q1s16, q2s16);
        q3s16 = vsubq_s16(q0s16, q3s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q12s16 = vaddq_s16(q10s16, q9s16);
        q13s16 = vaddq_s16(q11s16, q8s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q8s16 = vsubq_s16(q11s16, q8s16);
        q9s16 = vsubq_s16(q10s16, q9s16);

        // store the data  out 8,9,10,11,12,13,14,15
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q8s16 = vrshrq_n_s16(q8s16, 6);
        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q9s16 = vrshrq_n_s16(q9s16, 6);
        q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q2s16 = vrshrq_n_s16(q2s16, 6);
        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q3s16 = vrshrq_n_s16(q3s16, 6);
        q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q4s16 = vrshrq_n_s16(q4s16, 6);
        q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q5s16 = vrshrq_n_s16(q5s16, 6);
        q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q14s16 = vrshrq_n_s16(q14s16, 6);
        q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
                          vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        q15s16 = vrshrq_n_s16(q15s16, 6);
        q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
                          vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    } else {  // skip_adding_dest
        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q0s16, q15s16);
        q13s16 = vaddq_s16(q1s16, q14s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q14s16 = vsubq_s16(q1s16, q14s16);
        q15s16 = vsubq_s16(q0s16, q15s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q10s16, q5s16);
        q13s16 = vaddq_s16(q11s16, q4s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q4s16 = vsubq_s16(q11s16, q4s16);
        q5s16 = vsubq_s16(q10s16, q5s16);

        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q0s16, q3s16);
        q13s16 = vaddq_s16(q1s16, q2s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q2s16 = vsubq_s16(q1s16, q2s16);
        q3s16 = vsubq_s16(q0s16, q3s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q10s16, q9s16);
        q13s16 = vaddq_s16(q11s16, q8s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q8s16 = vsubq_s16(q11s16, q8s16);
        q9s16 = vsubq_s16(q10s16, q9s16);

        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
    }
    return;
}

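// Reduced first pass for the case where only the low-frequency, top-left
// corner of the coefficient block is nonzero: after the transpose only
// q8s16 and q9s16 are read, and each vmull/vqrshrn butterfly collapses to
// a single vqrdmulhq_s16 by a doubled cospi constant.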
void vpx_idct16x16_10_add_neon_pass1(
        int16_t *in,
        int16_t *out,
        int output_stride) {
    int16x4_t d4s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
    int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q6s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q15s32;
    int16x8x2_t q0x2s16;

    q0x2s16 = vld2q_s16(in);
    q8s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q9s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q10s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q11s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q12s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q13s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q14s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

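    // vqrdmulhq_s16 returns the saturated value of
    // (2 * a * b + (1 << 15)) >> 16, so multiplying by cospi * 2 yields
    // (a * cospi + (1 << 13)) >> 14: the same rounded DCT_CONST_BITS shift
    // the full version gets from its vmull/vqrshrn pairs.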
    // stage 3
    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
    q1s16 = vdupq_n_s16(cospi_4_64 * 2);

    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
    q7s16 = vqrdmulhq_s16(q9s16, q1s16);

    // stage 4
    q1s16 = vdupq_n_s16(cospi_16_64 * 2);
    d4s16 = vdup_n_s16(cospi_16_64);

    q8s16 = vqrdmulhq_s16(q8s16, q1s16);

    d8s16 = vget_low_s16(q4s16);
    d9s16 = vget_high_s16(q4s16);
    d14s16 = vget_low_s16(q7s16);
    d15s16 = vget_high_s16(q7s16);
    q9s32  = vmull_s16(d14s16, d4s16);
    q10s32 = vmull_s16(d15s16, d4s16);
    q12s32 = vmull_s16(d9s16, d4s16);
    q11s32 = vmull_s16(d8s16, d4s16);

    q15s32 = vsubq_s32(q10s32, q12s32);
    q6s32 = vsubq_s32(q9s32, q11s32);
    q9s32 = vaddq_s32(q9s32, q11s32);
    q10s32 = vaddq_s32(q10s32, q12s32);

    d11s16 = vqrshrn_n_s32(q15s32, 14);
    d10s16 = vqrshrn_n_s32(q6s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q10s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 6
    q2s16 = vaddq_s16(q8s16, q7s16);
    q9s16 = vaddq_s16(q8s16, q6s16);
    q10s16 = vaddq_s16(q8s16, q5s16);
    q11s16 = vaddq_s16(q8s16, q4s16);
    q12s16 = vsubq_s16(q8s16, q4s16);
    q13s16 = vsubq_s16(q8s16, q5s16);
    q14s16 = vsubq_s16(q8s16, q6s16);
    q15s16 = vsubq_s16(q8s16, q7s16);

    d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
    d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

    // store the data
    output_stride >>= 1;  // output_stride / 2, out is int16_t
    vst1_u64((uint64_t *)out, d4u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d5u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d18u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d19u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d20u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d21u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d22u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d23u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d24u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d25u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d26u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d27u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d28u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d29u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d30u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d31u64);
    return;
}

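// Reduced second pass: only q8s16 and q9s16 carry nonzero odd-half
// coefficients, so stage 3 uses the same doubled-constant vqrdmulhq_s16
// shortcut. This variant always writes back to out; the (void) casts mark
// the dest-adding parameters as intentionally unused.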
void vpx_idct16x16_10_add_neon_pass2(
        int16_t *src,
        int16_t *out,
        int16_t *pass1Output,
        int16_t skip_adding,
        uint8_t *dest,
        int dest_stride) {
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
    uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
    uint64x1_t d16u64, d17u64, d18u64, d19u64;
    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32;
    int16x8x2_t q0x2s16;
    (void)skip_adding;
    (void)dest;
    (void)dest_stride;

    q0x2s16 = vld2q_s16(src);
    q8s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q9s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q10s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q11s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q12s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q13s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q14s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    // stage 3
    q6s16 = vdupq_n_s16(cospi_30_64 * 2);
    q0s16 = vqrdmulhq_s16(q8s16, q6s16);
    q6s16 = vdupq_n_s16(cospi_2_64 * 2);
    q7s16 = vqrdmulhq_s16(q8s16, q6s16);

    q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
    q14s16 = vdupq_n_s16(cospi_6_64 * 2);
    q3s16 = vqrdmulhq_s16(q9s16, q15s16);
    q4s16 = vqrdmulhq_s16(q9s16, q14s16);

    // stage 4
    d0s16 = vget_low_s16(q0s16);
    d1s16 = vget_high_s16(q0s16);
    d6s16 = vget_low_s16(q3s16);
    d7s16 = vget_high_s16(q3s16);
    d8s16 = vget_low_s16(q4s16);
    d9s16 = vget_high_s16(q4s16);
    d14s16 = vget_low_s16(q7s16);
    d15s16 = vget_high_s16(q7s16);

    d30s16 = vdup_n_s16(cospi_8_64);
    d31s16 = vdup_n_s16(cospi_24_64);

    q12s32 = vmull_s16(d14s16, d31s16);
    q5s32 = vmull_s16(d15s16, d31s16);
    q2s32 = vmull_s16(d0s16, d31s16);
    q11s32 = vmull_s16(d1s16, d31s16);

    q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
    q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
    q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
    q11s32 = vmlal_s16(q11s32, d15s16, d30s16);

    d2s16 = vqrshrn_n_s32(q12s32, 14);
    d3s16 = vqrshrn_n_s32(q5s32, 14);
    d12s16 = vqrshrn_n_s32(q2s32, 14);
    d13s16 = vqrshrn_n_s32(q11s32, 14);
    q1s16 = vcombine_s16(d2s16, d3s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    d30s16 = vdup_n_s16(-cospi_8_64);
    q10s32 = vmull_s16(d8s16, d30s16);
    q13s32 = vmull_s16(d9s16, d30s16);
    q8s32 = vmull_s16(d6s16, d30s16);
    q9s32 = vmull_s16(d7s16, d30s16);

    q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
    q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
    q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
    q9s32 = vmlal_s16(q9s32, d9s16, d31s16);

    d4s16 = vqrshrn_n_s32(q10s32, 14);
    d5s16 = vqrshrn_n_s32(q13s32, 14);
    d10s16 = vqrshrn_n_s32(q8s32, 14);
    d11s16 = vqrshrn_n_s32(q9s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    // stage 5
    q8s16  = vaddq_s16(q0s16, q3s16);
    q9s16  = vaddq_s16(q1s16, q2s16);
    q10s16 = vsubq_s16(q1s16, q2s16);
    q11s16 = vsubq_s16(q0s16, q3s16);
    q12s16 = vsubq_s16(q7s16, q4s16);
    q13s16 = vsubq_s16(q6s16, q5s16);
    q14s16 = vaddq_s16(q6s16, q5s16);
    q15s16 = vaddq_s16(q7s16, q4s16);

    // stage 6
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);

    d14s16 = vdup_n_s16(cospi_16_64);
    q3s32 = vmull_s16(d26s16, d14s16);
    q4s32 = vmull_s16(d27s16, d14s16);
    q0s32 = vmull_s16(d20s16, d14s16);
    q1s32 = vmull_s16(d21s16, d14s16);

    q5s32 = vsubq_s32(q3s32, q0s32);
    q6s32 = vsubq_s32(q4s32, q1s32);
    q0s32 = vaddq_s32(q3s32, q0s32);
    q4s32 = vaddq_s32(q4s32, q1s32);

    d4s16 = vqrshrn_n_s32(q5s32, 14);
    d5s16 = vqrshrn_n_s32(q6s32, 14);
    d10s16 = vqrshrn_n_s32(q0s32, 14);
    d11s16 = vqrshrn_n_s32(q4s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    q0s32 = vmull_s16(d22s16, d14s16);
    q1s32 = vmull_s16(d23s16, d14s16);
    q13s32 = vmull_s16(d24s16, d14s16);
    q6s32 = vmull_s16(d25s16, d14s16);

    q10s32 = vsubq_s32(q13s32, q0s32);
    q4s32 = vsubq_s32(q6s32, q1s32);
    q13s32 = vaddq_s32(q13s32, q0s32);
    q6s32 = vaddq_s32(q6s32, q1s32);

    d6s16 = vqrshrn_n_s32(q10s32, 14);
    d7s16 = vqrshrn_n_s32(q4s32, 14);
    d8s16 = vqrshrn_n_s32(q13s32, 14);
    d9s16 = vqrshrn_n_s32(q6s32, 14);
    q3s16 = vcombine_s16(d6s16, d7s16);
    q4s16 = vcombine_s16(d8s16, d9s16);

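    // Stage 7: combine with the even-half rows from pass1Output and store
    // the final coefficients back to the 16-wide out buffer, two 64-bit
    // halves per 8-wide row, as in the full second pass.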
    // stage 7
    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    d4u64  = vreinterpret_u64_s16(vget_low_s16(q2s16));
    d5u64  = vreinterpret_u64_s16(vget_high_s16(q2s16));
    d6u64  = vreinterpret_u64_s16(vget_low_s16(q3s16));
    d7u64  = vreinterpret_u64_s16(vget_high_s16(q3s16));
    d8u64  = vreinterpret_u64_s16(vget_low_s16(q4s16));
    d9u64  = vreinterpret_u64_s16(vget_high_s16(q4s16));
    d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
    d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

    vst1_u64((uint64_t *)out, d16u64);
    out += 4;
    vst1_u64((uint64_t *)out, d17u64);
    out += 12;
    vst1_u64((uint64_t *)out, d18u64);
    out += 4;
    vst1_u64((uint64_t *)out, d19u64);
    out += 12;
    vst1_u64((uint64_t *)out, d4u64);
    out += 4;
    vst1_u64((uint64_t *)out, d5u64);
    out += 12;
    vst1_u64((uint64_t *)out, d6u64);
    out += 4;
    vst1_u64((uint64_t *)out, d7u64);
    out += 12;
    vst1_u64((uint64_t *)out, d8u64);
    out += 4;
    vst1_u64((uint64_t *)out, d9u64);
    out += 12;
    vst1_u64((uint64_t *)out, d10u64);
    out += 4;
    vst1_u64((uint64_t *)out, d11u64);
    out += 12;
    vst1_u64((uint64_t *)out, d28u64);
    out += 4;
    vst1_u64((uint64_t *)out, d29u64);
    out += 12;
    vst1_u64((uint64_t *)out, d30u64);
    out += 4;
    vst1_u64((uint64_t *)out, d31u64);
    return;
}