/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
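
/* NEON implementation of the full 32x32 inverse DCT with reconstruction.
 * This appears to be a direct C-intrinsics port of the hand-written
 * assembly version: the qNs16/dNs16 variable names mirror the q/d NEON
 * registers of the original, which is why the code below reads
 * register-by-register rather than in terms of the underlying math. */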

#define LOAD_FROM_TRANSPOSED(prev, first, second) \
    q14s16 = vld1q_s16(trans_buf + first * 8); \
    q13s16 = vld1q_s16(trans_buf + second * 8);

#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
    qA = vld1q_s16(out + first * 32); \
    qB = vld1q_s16(out + second * 32);

#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
    vst1q_s16(out + first * 32, qA); \
    vst1q_s16(out + second * 32, qB);

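/* Note on the macros above: the `prev` argument is unused in this C port
 * (in the assembly original it presumably tracked the previously used row
 * offset so addresses could be formed incrementally).  `first` and `second`
 * are absolute row indices: rows of 8 coefficients in trans_buf, rows of 32
 * in the `out` buffer. */
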
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
       __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
                                      q6s16, q7s16, q8s16, q9s16);
static INLINE void __STORE_COMBINE_CENTER_RESULTS(
        uint8_t *p1,
        uint8_t *p2,
        int stride,
        int16x8_t q6s16,
        int16x8_t q7s16,
        int16x8_t q8s16,
        int16x8_t q9s16) {
    int16x4_t d8s16, d9s16, d10s16, d11s16;

    d8s16 = vld1_s16((int16_t *)p1);
    p1 += stride;
    d11s16 = vld1_s16((int16_t *)p2);
    p2 -= stride;
    d9s16 = vld1_s16((int16_t *)p1);
    d10s16 = vld1_s16((int16_t *)p2);

    q7s16 = vrshrq_n_s16(q7s16, 6);
    q8s16 = vrshrq_n_s16(q8s16, 6);
    q9s16 = vrshrq_n_s16(q9s16, 6);
    q6s16 = vrshrq_n_s16(q6s16, 6);

    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                           vreinterpret_u8_s16(d9s16)));
    q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
                                           vreinterpret_u8_s16(d10s16)));
    q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
                                           vreinterpret_u8_s16(d11s16)));
    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                           vreinterpret_u8_s16(d8s16)));

    d9s16  = vreinterpret_s16_u8(vqmovun_s16(q7s16));
    d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
    d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
    d8s16  = vreinterpret_s16_u8(vqmovun_s16(q6s16));

    vst1_s16((int16_t *)p1, d9s16);
    p1 -= stride;
    vst1_s16((int16_t *)p2, d10s16);
    p2 += stride;
    vst1_s16((int16_t *)p1, d8s16);
    vst1_s16((int16_t *)p2, d11s16);
}
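
/* The vld1_s16/vst1_s16 calls above (via the int16_t casts) are just 8-byte
 * moves of eight uint8 prediction pixels.  Per row, the function amounts to
 * the scalar reconstruction below -- a sketch for reference only, kept out
 * of the build with #if 0: */
#if 0
static void store_combine_row_sketch(uint8_t *pred, const int16_t *resid) {
  int j;
  for (j = 0; j < 8; j++) {
    /* vrshrq_n_s16(., 6): rounding shift of the residual; then vaddw_u8 /
     * vqmovun_s16: add the prediction and saturate to [0, 255]. */
    const int v = pred[j] + ((resid[j] + 32) >> 6);
    pred[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
#endif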

#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
       __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
                                      q4s16, q5s16, q6s16, q7s16);
static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
        uint8_t *p1,
        uint8_t *p2,
        int stride,
        int16x8_t q4s16,
        int16x8_t q5s16,
        int16x8_t q6s16,
        int16x8_t q7s16) {
    int16x4_t d4s16, d5s16, d6s16, d7s16;

    d4s16 = vld1_s16((int16_t *)p1);
    p1 += stride;
    d7s16 = vld1_s16((int16_t *)p2);
    p2 -= stride;
    d5s16 = vld1_s16((int16_t *)p1);
    d6s16 = vld1_s16((int16_t *)p2);

    q5s16 = vrshrq_n_s16(q5s16, 6);
    q6s16 = vrshrq_n_s16(q6s16, 6);
    q7s16 = vrshrq_n_s16(q7s16, 6);
    q4s16 = vrshrq_n_s16(q4s16, 6);

    q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
                                           vreinterpret_u8_s16(d5s16)));
    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                           vreinterpret_u8_s16(d6s16)));
    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                           vreinterpret_u8_s16(d7s16)));
    q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
                                           vreinterpret_u8_s16(d4s16)));

    d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
    d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
    d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
    d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));

    vst1_s16((int16_t *)p1, d5s16);
    p1 -= stride;
    vst1_s16((int16_t *)p2, d6s16);
    p2 += stride;
    vst1_s16((int16_t *)p2, d7s16);
    vst1_s16((int16_t *)p1, d4s16);
}
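
/* Same reconstruction as the CENTER variant above; used for the row pairs
 * that start at the outer edges of the 32-row block (r7 walks down from row
 * 0, r6 walks up from row 31), while the CENTER variant works outward from
 * rows 15 and 16. */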

#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
        DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
static INLINE void DO_BUTTERFLY(
        int16x8_t q14s16,
        int16x8_t q13s16,
        int16_t first_const,
        int16_t second_const,
        int16x8_t *qAs16,
        int16x8_t *qBs16) {
    int16x4_t d30s16, d31s16;
    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
    int16x4_t dCs16, dDs16, dAs16, dBs16;

    dCs16 = vget_low_s16(q14s16);
    dDs16 = vget_high_s16(q14s16);
    dAs16 = vget_low_s16(q13s16);
    dBs16 = vget_high_s16(q13s16);

    d30s16 = vdup_n_s16(first_const);
    d31s16 = vdup_n_s16(second_const);

    q8s32 = vmull_s16(dCs16, d30s16);
    q10s32 = vmull_s16(dAs16, d31s16);
    q9s32 = vmull_s16(dDs16, d30s16);
    q11s32 = vmull_s16(dBs16, d31s16);
    q12s32 = vmull_s16(dCs16, d31s16);

    q8s32 = vsubq_s32(q8s32, q10s32);
    q9s32 = vsubq_s32(q9s32, q11s32);

    q10s32 = vmull_s16(dDs16, d31s16);
    q11s32 = vmull_s16(dAs16, d30s16);
    q15s32 = vmull_s16(dBs16, d30s16);

    q11s32 = vaddq_s32(q12s32, q11s32);
    q10s32 = vaddq_s32(q10s32, q15s32);

    *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
                          vqrshrn_n_s32(q9s32, 14));
    *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
                          vqrshrn_n_s32(q10s32, 14));
}
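
/* DO_BUTTERFLY is the standard rotation butterfly of the idct, with the
 * cospi_* constants carrying 14 fractional bits (DCT_CONST_BITS).  Per
 * 16-bit lane it computes the following -- a scalar sketch for reference
 * only (vqrshrn_n_s32 additionally saturates the narrowing to int16): */
#if 0
static void do_butterfly_sketch(int16_t a, int16_t b,  /* q14/q13 lanes */
                                int16_t c1, int16_t c2,
                                int16_t *out_a, int16_t *out_b) {
  *out_a = (int16_t)(((int32_t)a * c1 - (int32_t)b * c2 + (1 << 13)) >> 14);
  *out_b = (int16_t)(((int32_t)a * c2 + (int32_t)b * c1 + (1 << 13)) >> 14);
}
#endif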

static INLINE void idct32_transpose_pair(
        int16_t *input,
        int16_t *t_buf) {
    int16_t *in;
    int i;
    const int stride = 32;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

    for (i = 0; i < 4; i++, input += 8) {
        in = input;
        q8s16 = vld1q_s16(in);
        in += stride;
        q9s16 = vld1q_s16(in);
        in += stride;
        q10s16 = vld1q_s16(in);
        in += stride;
        q11s16 = vld1q_s16(in);
        in += stride;
        q12s16 = vld1q_s16(in);
        in += stride;
        q13s16 = vld1q_s16(in);
        in += stride;
        q14s16 = vld1q_s16(in);
        in += stride;
        q15s16 = vld1q_s16(in);

        d16s16 = vget_low_s16(q8s16);
        d17s16 = vget_high_s16(q8s16);
        d18s16 = vget_low_s16(q9s16);
        d19s16 = vget_high_s16(q9s16);
        d20s16 = vget_low_s16(q10s16);
        d21s16 = vget_high_s16(q10s16);
        d22s16 = vget_low_s16(q11s16);
        d23s16 = vget_high_s16(q11s16);
        d24s16 = vget_low_s16(q12s16);
        d25s16 = vget_high_s16(q12s16);
        d26s16 = vget_low_s16(q13s16);
        d27s16 = vget_high_s16(q13s16);
        d28s16 = vget_low_s16(q14s16);
        d29s16 = vget_high_s16(q14s16);
        d30s16 = vget_low_s16(q15s16);
        d31s16 = vget_high_s16(q15s16);

        q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
        q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
        q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
        q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
        q12s16 = vcombine_s16(d17s16, d25s16);
        q13s16 = vcombine_s16(d19s16, d27s16);
        q14s16 = vcombine_s16(d21s16, d29s16);
        q15s16 = vcombine_s16(d23s16, d31s16);

        q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
                            vreinterpretq_s32_s16(q10s16));
        q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
                            vreinterpretq_s32_s16(q11s16));
        q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
                            vreinterpretq_s32_s16(q14s16));
        q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
                            vreinterpretq_s32_s16(q15s16));

        q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                            vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
        q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                            vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
        q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                            vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
        q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                            vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

        vst1q_s16(t_buf, q0x2s16.val[0]);
        t_buf += 8;
        vst1q_s16(t_buf, q0x2s16.val[1]);
        t_buf += 8;
        vst1q_s16(t_buf, q1x2s16.val[0]);
        t_buf += 8;
        vst1q_s16(t_buf, q1x2s16.val[1]);
        t_buf += 8;
        vst1q_s16(t_buf, q2x2s16.val[0]);
        t_buf += 8;
        vst1q_s16(t_buf, q2x2s16.val[1]);
        t_buf += 8;
        vst1q_s16(t_buf, q3x2s16.val[0]);
        t_buf += 8;
        vst1q_s16(t_buf, q3x2s16.val[1]);
        t_buf += 8;
    }
}
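
/* idct32_transpose_pair transposes an 8x32 strip of the input into t_buf:
 * afterwards t_buf + j * 8 holds what was column j of the eight input rows,
 * which is exactly what LOAD_FROM_TRANSPOSED indexes.  Each loop iteration
 * does one 8x8 transpose via the vcombine ("vswp") and vtrn steps.  Scalar
 * equivalent, as a sketch (not compiled): */
#if 0
static void idct32_transpose_pair_sketch(const int16_t *input,
                                         int16_t *t_buf) {
  int r, c;
  for (c = 0; c < 32; c++)
    for (r = 0; r < 8; r++)
      t_buf[c * 8 + r] = input[r * 32 + c];
}
#endif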

static INLINE void idct32_bands_end_1st_pass(
        int16_t *out,
        int16x8_t q2s16,
        int16x8_t q3s16,
        int16x8_t q6s16,
        int16x8_t q7s16,
        int16x8_t q8s16,
        int16x8_t q9s16,
        int16x8_t q10s16,
        int16x8_t q11s16,
        int16x8_t q12s16,
        int16x8_t q13s16,
        int16x8_t q14s16,
        int16x8_t q15s16) {
    int16x8_t q0s16, q1s16, q4s16, q5s16;

    STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
    STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);

    LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
    STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);

    LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
    q2s16 = vaddq_s16(q10s16, q1s16);
    q3s16 = vaddq_s16(q11s16, q0s16);
    q4s16 = vsubq_s16(q11s16, q0s16);
    q5s16 = vsubq_s16(q10s16, q1s16);

    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
    q8s16 = vaddq_s16(q4s16, q1s16);
    q9s16 = vaddq_s16(q5s16, q0s16);
    q6s16 = vsubq_s16(q5s16, q0s16);
    q7s16 = vsubq_s16(q4s16, q1s16);
    STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
    STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);

    LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
    STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);

    LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
    q2s16 = vaddq_s16(q12s16, q1s16);
    q3s16 = vaddq_s16(q13s16, q0s16);
    q4s16 = vsubq_s16(q13s16, q0s16);
    q5s16 = vsubq_s16(q12s16, q1s16);

    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
    q8s16 = vaddq_s16(q4s16, q1s16);
    q9s16 = vaddq_s16(q5s16, q0s16);
    q6s16 = vsubq_s16(q5s16, q0s16);
    q7s16 = vsubq_s16(q4s16, q1s16);
    STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
    STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);

    LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
    STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);

    LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
    q2s16 = vaddq_s16(q14s16, q1s16);
    q3s16 = vaddq_s16(q15s16, q0s16);
    q4s16 = vsubq_s16(q15s16, q0s16);
    q5s16 = vsubq_s16(q14s16, q1s16);

    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
    q8s16 = vaddq_s16(q4s16, q1s16);
    q9s16 = vaddq_s16(q5s16, q0s16);
    q6s16 = vsubq_s16(q5s16, q0s16);
    q7s16 = vsubq_s16(q4s16, q1s16);
    STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
    STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);

    LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
    STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
}

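/* The two idct32_bands_end_* helpers apply the final (stage 7) add/sub
 * butterflies that combine symmetric row pairs.  The 1st-pass version above
 * writes all 32 rows of the current 8-column band back to the intermediate
 * buffer; the 2nd-pass version below instead reconstructs into dest, with
 * r7/r6 walking inward from rows 0 and 31 and r10/r9 walking outward from
 * rows 16 and 15. */
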
static INLINE void idct32_bands_end_2nd_pass(
        int16_t *out,
        uint8_t *dest,
        int stride,
        int16x8_t q2s16,
        int16x8_t q3s16,
        int16x8_t q6s16,
        int16x8_t q7s16,
        int16x8_t q8s16,
        int16x8_t q9s16,
        int16x8_t q10s16,
        int16x8_t q11s16,
        int16x8_t q12s16,
        int16x8_t q13s16,
        int16x8_t q14s16,
        int16x8_t q15s16) {
    uint8_t *r6  = dest + 31 * stride;
    uint8_t *r7  = dest/* +  0 * stride*/;
    uint8_t *r9  = dest + 15 * stride;
    uint8_t *r10 = dest + 16 * stride;
    int str2 = stride << 1;
    int16x8_t q0s16, q1s16, q4s16, q5s16;

    STORE_COMBINE_CENTER_RESULTS(r10, r9);
    r10 += str2; r9 -= str2;

    LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
    r7 += str2; r6 -= str2;

    LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
    q2s16 = vaddq_s16(q10s16, q1s16);
    q3s16 = vaddq_s16(q11s16, q0s16);
    q4s16 = vsubq_s16(q11s16, q0s16);
    q5s16 = vsubq_s16(q10s16, q1s16);

    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
    q8s16 = vaddq_s16(q4s16, q1s16);
    q9s16 = vaddq_s16(q5s16, q0s16);
    q6s16 = vsubq_s16(q5s16, q0s16);
    q7s16 = vsubq_s16(q4s16, q1s16);
    STORE_COMBINE_CENTER_RESULTS(r10, r9);
    r10 += str2; r9 -= str2;

    LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
    r7 += str2; r6 -= str2;

    LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
    q2s16 = vaddq_s16(q12s16, q1s16);
    q3s16 = vaddq_s16(q13s16, q0s16);
    q4s16 = vsubq_s16(q13s16, q0s16);
    q5s16 = vsubq_s16(q12s16, q1s16);

    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
    q8s16 = vaddq_s16(q4s16, q1s16);
    q9s16 = vaddq_s16(q5s16, q0s16);
    q6s16 = vsubq_s16(q5s16, q0s16);
    q7s16 = vsubq_s16(q4s16, q1s16);
    STORE_COMBINE_CENTER_RESULTS(r10, r9);
    r10 += str2; r9 -= str2;

    LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
    r7 += str2; r6 -= str2;

    LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
    q2s16 = vaddq_s16(q14s16, q1s16);
    q3s16 = vaddq_s16(q15s16, q0s16);
    q4s16 = vsubq_s16(q15s16, q0s16);
    q5s16 = vsubq_s16(q14s16, q1s16);

    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
    q8s16 = vaddq_s16(q4s16, q1s16);
    q9s16 = vaddq_s16(q5s16, q0s16);
    q6s16 = vsubq_s16(q5s16, q0s16);
    q7s16 = vsubq_s16(q4s16, q1s16);
    STORE_COMBINE_CENTER_RESULTS(r10, r9);

    LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
    q4s16 = vaddq_s16(q2s16, q1s16);
    q5s16 = vaddq_s16(q3s16, q0s16);
    q6s16 = vsubq_s16(q3s16, q0s16);
    q7s16 = vsubq_s16(q2s16, q1s16);
    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
}

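/* Full (1024-coefficient) 32x32 inverse transform and add.  Runs two passes
 * of the 1-D 32-point idct, each over four bands of 8 columns: pass 1 reads
 * the transposed input and writes the intermediate result to pass1[]; pass 2
 * re-runs the same code with pass1[] as its input and adds the final result
 * to dest.  pass2[] serves only as scratch for band intermediates during the
 * second pass. */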
void vpx_idct32x32_1024_add_neon(
        int16_t *input,
        uint8_t *dest,
        int stride) {
    int i, idct32_pass_loop;
    int16_t trans_buf[32 * 8];
    int16_t pass1[32 * 32];
    int16_t pass2[32 * 32];
    int16_t *out;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

    for (idct32_pass_loop = 0, out = pass1;
         idct32_pass_loop < 2;
         idct32_pass_loop++,
         input = pass1,  // the input of pass2 is the result of pass1
         out = pass2) {
        for (i = 0;
             i < 4; i++,
             input += 32 * 8, out += 8) {  // idct32_bands_loop
            idct32_transpose_pair(input, trans_buf);
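            // trans_buf now holds the transposed strip for this band: row j
            // (trans_buf + j * 8) is column j of the eight input rows, so
            // the LOAD_FROM_TRANSPOSED indices below are column numbers.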
            // -----------------------------------------
            // BLOCK A: 16-19,28-31
            // -----------------------------------------
            // generate 16,17,30,31
            // part of stage 1
            LOAD_FROM_TRANSPOSED(0, 1, 31)
            DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
            LOAD_FROM_TRANSPOSED(31, 17, 15)
            DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
            // part of stage 2
            q4s16 = vaddq_s16(q0s16, q1s16);
            q13s16 = vsubq_s16(q0s16, q1s16);
            q6s16 = vaddq_s16(q2s16, q3s16);
            q14s16 = vsubq_s16(q2s16, q3s16);
            // part of stage 3
            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)

            // generate 18,19,28,29
            // part of stage 1
            LOAD_FROM_TRANSPOSED(15, 9, 23)
            DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
            LOAD_FROM_TRANSPOSED(23, 25, 7)
            DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
            // part of stage 2
            q13s16 = vsubq_s16(q3s16, q2s16);
            q3s16 = vaddq_s16(q3s16, q2s16);
            q14s16 = vsubq_s16(q1s16, q0s16);
            q2s16 = vaddq_s16(q1s16, q0s16);
            // part of stage 3
            DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
            // part of stage 4
            q8s16 = vaddq_s16(q4s16, q2s16);
            q9s16 = vaddq_s16(q5s16, q0s16);
            q10s16 = vaddq_s16(q7s16, q1s16);
            q15s16 = vaddq_s16(q6s16, q3s16);
            q13s16 = vsubq_s16(q5s16, q0s16);
            q14s16 = vsubq_s16(q7s16, q1s16);
            STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
            STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
            // part of stage 5
            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
            STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
            // part of stage 4
            q13s16 = vsubq_s16(q4s16, q2s16);
            q14s16 = vsubq_s16(q6s16, q3s16);
            // part of stage 5
            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
            STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)

            // -----------------------------------------
            // BLOCK B: 20-23,24-27
            // -----------------------------------------
            // generate 20,21,26,27
            // part of stage 1
            LOAD_FROM_TRANSPOSED(7, 5, 27)
            DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
            LOAD_FROM_TRANSPOSED(27, 21, 11)
            DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
            // part of stage 2
            q13s16 = vsubq_s16(q0s16, q1s16);
            q0s16 = vaddq_s16(q0s16, q1s16);
            q14s16 = vsubq_s16(q2s16, q3s16);
            q2s16 = vaddq_s16(q2s16, q3s16);
            // part of stage 3
            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)

            // generate 22,23,24,25
            // part of stage 1
            LOAD_FROM_TRANSPOSED(11, 13, 19)
            DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
            LOAD_FROM_TRANSPOSED(19, 29, 3)
            DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
            // part of stage 2
            q14s16 = vsubq_s16(q4s16, q5s16);
            q5s16  = vaddq_s16(q4s16, q5s16);
            q13s16 = vsubq_s16(q6s16, q7s16);
            q6s16  = vaddq_s16(q6s16, q7s16);
            // part of stage 3
            DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
            // part of stage 4
            q10s16 = vaddq_s16(q7s16, q1s16);
            q11s16 = vaddq_s16(q5s16, q0s16);
            q12s16 = vaddq_s16(q6s16, q2s16);
            q15s16 = vaddq_s16(q4s16, q3s16);
            // part of stage 6
            LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
            q8s16 = vaddq_s16(q14s16, q11s16);
            q9s16 = vaddq_s16(q13s16, q10s16);
            q13s16 = vsubq_s16(q13s16, q10s16);
            q11s16 = vsubq_s16(q14s16, q11s16);
            STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
            LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
            q8s16  = vsubq_s16(q9s16, q12s16);
            q10s16 = vaddq_s16(q14s16, q15s16);
            q14s16 = vsubq_s16(q14s16, q15s16);
            q12s16 = vaddq_s16(q9s16, q12s16);
            STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
            // part of stage 7
            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
            STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
            q13s16 = q11s16;
            q14s16 = q8s16;
            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
            STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
            // part of stage 4
            q14s16 = vsubq_s16(q5s16, q0s16);
            q13s16 = vsubq_s16(q6s16, q2s16);
            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16)
            q14s16 = vsubq_s16(q7s16, q1s16);
            q13s16 = vsubq_s16(q4s16, q3s16);
            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16)
            // part of stage 6
            LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
            q8s16 = vaddq_s16(q14s16, q1s16);
            q9s16 = vaddq_s16(q13s16, q6s16);
            q13s16 = vsubq_s16(q13s16, q6s16);
            q1s16 = vsubq_s16(q14s16, q1s16);
            STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
            LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
            q14s16 = vsubq_s16(q8s16, q5s16);
            q10s16 = vaddq_s16(q8s16, q5s16);
            q11s16 = vaddq_s16(q9s16, q0s16);
            q0s16 = vsubq_s16(q9s16, q0s16);
            STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
            // part of stage 7
            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
            STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
            DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
                         &q1s16, &q0s16);
            STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)

            // -----------------------------------------
            // BLOCK C: 8-11,12-15
            // -----------------------------------------
            // generate 8,9,14,15
            // part of stage 2
            LOAD_FROM_TRANSPOSED(3, 2, 30)
            DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
            LOAD_FROM_TRANSPOSED(30, 18, 14)
            DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
            // part of stage 3
            q13s16 = vsubq_s16(q0s16, q1s16);
            q0s16 = vaddq_s16(q0s16, q1s16);
            q14s16 = vsubq_s16(q2s16, q3s16);
            q2s16 = vaddq_s16(q2s16, q3s16);
            // part of stage 4
            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)

            // generate 10,11,12,13
            // part of stage 2
            LOAD_FROM_TRANSPOSED(14, 10, 22)
            DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
            LOAD_FROM_TRANSPOSED(22, 26, 6)
            DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
            // part of stage 3
            q14s16 = vsubq_s16(q4s16, q5s16);
            q5s16 = vaddq_s16(q4s16, q5s16);
            q13s16 = vsubq_s16(q6s16, q7s16);
            q6s16 = vaddq_s16(q6s16, q7s16);
            // part of stage 4
            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
            // part of stage 5
            q8s16 = vaddq_s16(q0s16, q5s16);
            q9s16 = vaddq_s16(q1s16, q7s16);
            q13s16 = vsubq_s16(q1s16, q7s16);
            q14s16 = vsubq_s16(q3s16, q4s16);
            q10s16 = vaddq_s16(q3s16, q4s16);
            q15s16 = vaddq_s16(q2s16, q6s16);
            STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
            STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
            // part of stage 6
            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
            STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
            q13s16 = vsubq_s16(q0s16, q5s16);
            q14s16 = vsubq_s16(q2s16, q6s16);
            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
            STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)

            // -----------------------------------------
            // BLOCK D: 0-3,4-7
            // -----------------------------------------
            // generate 4,5,6,7
            // part of stage 3
            LOAD_FROM_TRANSPOSED(6, 4, 28)
            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
            LOAD_FROM_TRANSPOSED(28, 20, 12)
            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
            // part of stage 4
            q13s16 = vsubq_s16(q0s16, q1s16);
            q0s16 = vaddq_s16(q0s16, q1s16);
            q14s16 = vsubq_s16(q2s16, q3s16);
            q2s16 = vaddq_s16(q2s16, q3s16);
            // part of stage 5
            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)

            // generate 0,1,2,3
            // part of stage 4
            LOAD_FROM_TRANSPOSED(12, 0, 16)
            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
            LOAD_FROM_TRANSPOSED(16, 8, 24)
            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
            // part of stage 5
            q4s16 = vaddq_s16(q7s16, q6s16);
            q7s16 = vsubq_s16(q7s16, q6s16);
            q6s16 = vsubq_s16(q5s16, q14s16);
            q5s16 = vaddq_s16(q5s16, q14s16);
            // part of stage 6
            q8s16 = vaddq_s16(q4s16, q2s16);
            q9s16 = vaddq_s16(q5s16, q3s16);
            q10s16 = vaddq_s16(q6s16, q1s16);
            q11s16 = vaddq_s16(q7s16, q0s16);
            q12s16 = vsubq_s16(q7s16, q0s16);
            q13s16 = vsubq_s16(q6s16, q1s16);
            q14s16 = vsubq_s16(q5s16, q3s16);
            q15s16 = vsubq_s16(q4s16, q2s16);
            // part of stage 7
            LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
            q2s16 = vaddq_s16(q8s16, q1s16);
            q3s16 = vaddq_s16(q9s16, q0s16);
            q4s16 = vsubq_s16(q9s16, q0s16);
            q5s16 = vsubq_s16(q8s16, q1s16);
            LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
            q8s16 = vaddq_s16(q4s16, q1s16);
            q9s16 = vaddq_s16(q5s16, q0s16);
            q6s16 = vsubq_s16(q5s16, q0s16);
            q7s16 = vsubq_s16(q4s16, q1s16);

            if (idct32_pass_loop == 0) {
                idct32_bands_end_1st_pass(out,
                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
            } else {
                idct32_bands_end_2nd_pass(out, dest, stride,
                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
                dest += 8;
            }
        }
    }
}
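
/* Minimal usage sketch (not part of the build): the caller provides 1024
 * dequantized coefficients and a prediction block that is overwritten with
 * the reconstruction.  In libvpx this function is normally reached through
 * the run-time CPU dispatch (vpx_dsp_rtcd) rather than called directly. */
#if 0
static void idct32x32_usage_sketch(void) {
  int16_t coeffs[32 * 32];  /* dequantized coefficients, row major */
  uint8_t frame[32 * 64];   /* prediction pixels, stride 64 */
  /* ... fill coeffs and frame ... */
  vpx_idct32x32_1024_add_neon(coeffs, frame, 64);
}
#endif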