10a39d0a697ff3603e8c100300fda363658e10b23James Zern/*
20a39d0a697ff3603e8c100300fda363658e10b23James Zern *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
30a39d0a697ff3603e8c100300fda363658e10b23James Zern *
40a39d0a697ff3603e8c100300fda363658e10b23James Zern *  Use of this source code is governed by a BSD-style license
50a39d0a697ff3603e8c100300fda363658e10b23James Zern *  that can be found in the LICENSE file in the root of the source
60a39d0a697ff3603e8c100300fda363658e10b23James Zern *  tree. An additional intellectual property rights grant can be found
70a39d0a697ff3603e8c100300fda363658e10b23James Zern *  in the file PATENTS.  All contributing project authors may
80a39d0a697ff3603e8c100300fda363658e10b23James Zern *  be found in the AUTHORS file in the root of the source tree.
90a39d0a697ff3603e8c100300fda363658e10b23James Zern */
100a39d0a697ff3603e8c100300fda363658e10b23James Zern
110a39d0a697ff3603e8c100300fda363658e10b23James Zern#include <arm_neon.h>
120a39d0a697ff3603e8c100300fda363658e10b23James Zern
130a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_config.h"
140a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_dsp_rtcd.h"
150a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/arm/idct_neon.h"
160a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/arm/transpose_neon.h"
170a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/txfm_common.h"
180a39d0a697ff3603e8c100300fda363658e10b23James Zern
190a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void load_from_transformed(const int32_t *const trans_buf,
200a39d0a697ff3603e8c100300fda363658e10b23James Zern                                         const int first, const int second,
210a39d0a697ff3603e8c100300fda363658e10b23James Zern                                         int32x4x2_t *const q0,
220a39d0a697ff3603e8c100300fda363658e10b23James Zern                                         int32x4x2_t *const q1) {
230a39d0a697ff3603e8c100300fda363658e10b23James Zern  q0->val[0] = vld1q_s32(trans_buf + first * 8);
240a39d0a697ff3603e8c100300fda363658e10b23James Zern  q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4);
250a39d0a697ff3603e8c100300fda363658e10b23James Zern  q1->val[0] = vld1q_s32(trans_buf + second * 8);
260a39d0a697ff3603e8c100300fda363658e10b23James Zern  q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4);
270a39d0a697ff3603e8c100300fda363658e10b23James Zern}
280a39d0a697ff3603e8c100300fda363658e10b23James Zern
290a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void load_from_output(const int32_t *const out, const int first,
300a39d0a697ff3603e8c100300fda363658e10b23James Zern                                    const int second, int32x4x2_t *const q0,
310a39d0a697ff3603e8c100300fda363658e10b23James Zern                                    int32x4x2_t *const q1) {
320a39d0a697ff3603e8c100300fda363658e10b23James Zern  q0->val[0] = vld1q_s32(out + first * 32);
330a39d0a697ff3603e8c100300fda363658e10b23James Zern  q0->val[1] = vld1q_s32(out + first * 32 + 4);
340a39d0a697ff3603e8c100300fda363658e10b23James Zern  q1->val[0] = vld1q_s32(out + second * 32);
350a39d0a697ff3603e8c100300fda363658e10b23James Zern  q1->val[1] = vld1q_s32(out + second * 32 + 4);
360a39d0a697ff3603e8c100300fda363658e10b23James Zern}
370a39d0a697ff3603e8c100300fda363658e10b23James Zern
380a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_in_output(int32_t *const out, const int first,
390a39d0a697ff3603e8c100300fda363658e10b23James Zern                                   const int second, const int32x4x2_t q0,
400a39d0a697ff3603e8c100300fda363658e10b23James Zern                                   const int32x4x2_t q1) {
410a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(out + first * 32, q0.val[0]);
420a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(out + first * 32 + 4, q0.val[1]);
430a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(out + second * 32, q1.val[0]);
440a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(out + second * 32 + 4, q1.val[1]);
450a39d0a697ff3603e8c100300fda363658e10b23James Zern}
460a39d0a697ff3603e8c100300fda363658e10b23James Zern
470a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void highbd_store_combine_results(
480a39d0a697ff3603e8c100300fda363658e10b23James Zern    uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0,
490a39d0a697ff3603e8c100300fda363658e10b23James Zern    const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3,
500a39d0a697ff3603e8c100300fda363658e10b23James Zern    const int16x8_t max) {
510a39d0a697ff3603e8c100300fda363658e10b23James Zern  int16x8_t o[4];
520a39d0a697ff3603e8c100300fda363658e10b23James Zern  uint16x8_t d[4];
530a39d0a697ff3603e8c100300fda363658e10b23James Zern
540a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[0] = vld1q_u16(p1);
550a39d0a697ff3603e8c100300fda363658e10b23James Zern  p1 += stride;
560a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[1] = vld1q_u16(p1);
570a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[3] = vld1q_u16(p2);
580a39d0a697ff3603e8c100300fda363658e10b23James Zern  p2 -= stride;
590a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[2] = vld1q_u16(p2);
600a39d0a697ff3603e8c100300fda363658e10b23James Zern
610a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6));
620a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6));
630a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6));
640a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6));
650a39d0a697ff3603e8c100300fda363658e10b23James Zern
660a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0]));
670a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1]));
680a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2]));
690a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3]));
700a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[0] = vminq_s16(o[0], max);
710a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[1] = vminq_s16(o[1], max);
720a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[2] = vminq_s16(o[2], max);
730a39d0a697ff3603e8c100300fda363658e10b23James Zern  o[3] = vminq_s16(o[3], max);
740a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[0] = vqshluq_n_s16(o[0], 0);
750a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[1] = vqshluq_n_s16(o[1], 0);
760a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[2] = vqshluq_n_s16(o[2], 0);
770a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[3] = vqshluq_n_s16(o[3], 0);
780a39d0a697ff3603e8c100300fda363658e10b23James Zern
790a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_u16(p1, d[1]);
800a39d0a697ff3603e8c100300fda363658e10b23James Zern  p1 -= stride;
810a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_u16(p1, d[0]);
820a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_u16(p2, d[2]);
830a39d0a697ff3603e8c100300fda363658e10b23James Zern  p2 += stride;
840a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_u16(p2, d[3]);
850a39d0a697ff3603e8c100300fda363658e10b23James Zern}
860a39d0a697ff3603e8c100300fda363658e10b23James Zern
870a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1,
880a39d0a697ff3603e8c100300fda363658e10b23James Zern                                const int32_t first_const,
890a39d0a697ff3603e8c100300fda363658e10b23James Zern                                const int32_t second_const,
900a39d0a697ff3603e8c100300fda363658e10b23James Zern                                int32x4x2_t *const qOut0,
910a39d0a697ff3603e8c100300fda363658e10b23James Zern                                int32x4x2_t *const qOut1) {
920a39d0a697ff3603e8c100300fda363658e10b23James Zern  int64x2x2_t q[4];
930a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32x2_t d[6];
940a39d0a697ff3603e8c100300fda363658e10b23James Zern
950a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9.
960a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[4] = vdup_n_s32(first_const);
970a39d0a697ff3603e8c100300fda363658e10b23James Zern  d[5] = vdup_n_s32(second_const);
980a39d0a697ff3603e8c100300fda363658e10b23James Zern
990a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]);
1000a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]);
1010a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]);
1020a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]);
1030a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]);
1040a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]);
1050a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]);
1060a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]);
1070a39d0a697ff3603e8c100300fda363658e10b23James Zern
1080a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]);
1090a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]);
1100a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]);
1110a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]);
1120a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]);
1130a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]);
1140a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]);
1150a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]);
1160a39d0a697ff3603e8c100300fda363658e10b23James Zern
1170a39d0a697ff3603e8c100300fda363658e10b23James Zern  qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS),
1180a39d0a697ff3603e8c100300fda363658e10b23James Zern                               vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS));
1190a39d0a697ff3603e8c100300fda363658e10b23James Zern  qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS),
1200a39d0a697ff3603e8c100300fda363658e10b23James Zern                               vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS));
1210a39d0a697ff3603e8c100300fda363658e10b23James Zern  qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS),
1220a39d0a697ff3603e8c100300fda363658e10b23James Zern                               vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS));
1230a39d0a697ff3603e8c100300fda363658e10b23James Zern  qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS),
1240a39d0a697ff3603e8c100300fda363658e10b23James Zern                               vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS));
1250a39d0a697ff3603e8c100300fda363658e10b23James Zern}
1260a39d0a697ff3603e8c100300fda363658e10b23James Zern
1270a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void load_s32x4q_dual(
1280a39d0a697ff3603e8c100300fda363658e10b23James Zern    const int32_t *in, int32x4x2_t *const s0, int32x4x2_t *const s1,
1290a39d0a697ff3603e8c100300fda363658e10b23James Zern    int32x4x2_t *const s2, int32x4x2_t *const s3, int32x4x2_t *const s4,
1300a39d0a697ff3603e8c100300fda363658e10b23James Zern    int32x4x2_t *const s5, int32x4x2_t *const s6, int32x4x2_t *const s7) {
1310a39d0a697ff3603e8c100300fda363658e10b23James Zern  s0->val[0] = vld1q_s32(in);
1320a39d0a697ff3603e8c100300fda363658e10b23James Zern  s0->val[1] = vld1q_s32(in + 4);
1330a39d0a697ff3603e8c100300fda363658e10b23James Zern  in += 32;
1340a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1->val[0] = vld1q_s32(in);
1350a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1->val[1] = vld1q_s32(in + 4);
1360a39d0a697ff3603e8c100300fda363658e10b23James Zern  in += 32;
1370a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2->val[0] = vld1q_s32(in);
1380a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2->val[1] = vld1q_s32(in + 4);
1390a39d0a697ff3603e8c100300fda363658e10b23James Zern  in += 32;
1400a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3->val[0] = vld1q_s32(in);
1410a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3->val[1] = vld1q_s32(in + 4);
1420a39d0a697ff3603e8c100300fda363658e10b23James Zern  in += 32;
1430a39d0a697ff3603e8c100300fda363658e10b23James Zern  s4->val[0] = vld1q_s32(in);
1440a39d0a697ff3603e8c100300fda363658e10b23James Zern  s4->val[1] = vld1q_s32(in + 4);
1450a39d0a697ff3603e8c100300fda363658e10b23James Zern  in += 32;
1460a39d0a697ff3603e8c100300fda363658e10b23James Zern  s5->val[0] = vld1q_s32(in);
1470a39d0a697ff3603e8c100300fda363658e10b23James Zern  s5->val[1] = vld1q_s32(in + 4);
1480a39d0a697ff3603e8c100300fda363658e10b23James Zern  in += 32;
1490a39d0a697ff3603e8c100300fda363658e10b23James Zern  s6->val[0] = vld1q_s32(in);
1500a39d0a697ff3603e8c100300fda363658e10b23James Zern  s6->val[1] = vld1q_s32(in + 4);
1510a39d0a697ff3603e8c100300fda363658e10b23James Zern  in += 32;
1520a39d0a697ff3603e8c100300fda363658e10b23James Zern  s7->val[0] = vld1q_s32(in);
1530a39d0a697ff3603e8c100300fda363658e10b23James Zern  s7->val[1] = vld1q_s32(in + 4);
1540a39d0a697ff3603e8c100300fda363658e10b23James Zern}
1550a39d0a697ff3603e8c100300fda363658e10b23James Zern
1560a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void transpose_and_store_s32_8x8(int32x4x2_t a0, int32x4x2_t a1,
1570a39d0a697ff3603e8c100300fda363658e10b23James Zern                                               int32x4x2_t a2, int32x4x2_t a3,
1580a39d0a697ff3603e8c100300fda363658e10b23James Zern                                               int32x4x2_t a4, int32x4x2_t a5,
1590a39d0a697ff3603e8c100300fda363658e10b23James Zern                                               int32x4x2_t a6, int32x4x2_t a7,
1600a39d0a697ff3603e8c100300fda363658e10b23James Zern                                               int32_t **out) {
1610a39d0a697ff3603e8c100300fda363658e10b23James Zern  transpose_s32_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
1620a39d0a697ff3603e8c100300fda363658e10b23James Zern
1630a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a0.val[0]);
1640a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1650a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a0.val[1]);
1660a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1670a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a1.val[0]);
1680a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1690a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a1.val[1]);
1700a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1710a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a2.val[0]);
1720a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1730a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a2.val[1]);
1740a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1750a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a3.val[0]);
1760a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1770a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a3.val[1]);
1780a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1790a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a4.val[0]);
1800a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1810a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a4.val[1]);
1820a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1830a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a5.val[0]);
1840a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1850a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a5.val[1]);
1860a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1870a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a6.val[0]);
1880a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1890a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a6.val[1]);
1900a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1910a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a7.val[0]);
1920a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1930a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(*out, a7.val[1]);
1940a39d0a697ff3603e8c100300fda363658e10b23James Zern  *out += 4;
1950a39d0a697ff3603e8c100300fda363658e10b23James Zern}
1960a39d0a697ff3603e8c100300fda363658e10b23James Zern
1970a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) {
1980a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i;
1990a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32x4x2_t s0, s1, s2, s3, s4, s5, s6, s7;
2000a39d0a697ff3603e8c100300fda363658e10b23James Zern
2010a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (i = 0; i < 4; i++, input += 8) {
2020a39d0a697ff3603e8c100300fda363658e10b23James Zern    load_s32x4q_dual(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
2030a39d0a697ff3603e8c100300fda363658e10b23James Zern    transpose_and_store_s32_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
2040a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
2050a39d0a697ff3603e8c100300fda363658e10b23James Zern}
2060a39d0a697ff3603e8c100300fda363658e10b23James Zern
2070a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void idct32_bands_end_1st_pass(int32_t *const out,
2080a39d0a697ff3603e8c100300fda363658e10b23James Zern                                             int32x4x2_t *const q) {
2090a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 16, 17, q[6], q[7]);
2100a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 14, 15, q[8], q[9]);
2110a39d0a697ff3603e8c100300fda363658e10b23James Zern
2120a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 30, 31, &q[0], &q[1]);
2130a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
2140a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
2150a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
2160a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
2170a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 30, 31, q[6], q[7]);
2180a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 0, 1, q[4], q[5]);
2190a39d0a697ff3603e8c100300fda363658e10b23James Zern
2200a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 12, 13, &q[0], &q[1]);
2210a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2] = highbd_idct_add_dual(q[10], q[1]);
2220a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3] = highbd_idct_add_dual(q[11], q[0]);
2230a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_sub_dual(q[11], q[0]);
2240a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_sub_dual(q[10], q[1]);
2250a39d0a697ff3603e8c100300fda363658e10b23James Zern
2260a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 18, 19, &q[0], &q[1]);
2270a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[8] = highbd_idct_add_dual(q[4], q[1]);
2280a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[9] = highbd_idct_add_dual(q[5], q[0]);
2290a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[5], q[0]);
2300a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[4], q[1]);
2310a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 18, 19, q[6], q[7]);
2320a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 12, 13, q[8], q[9]);
2330a39d0a697ff3603e8c100300fda363658e10b23James Zern
2340a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 28, 29, &q[0], &q[1]);
2350a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
2360a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
2370a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
2380a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
2390a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 28, 29, q[6], q[7]);
2400a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 2, 3, q[4], q[5]);
2410a39d0a697ff3603e8c100300fda363658e10b23James Zern
2420a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 10, 11, &q[0], &q[1]);
2430a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2] = highbd_idct_add_dual(q[12], q[1]);
2440a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3] = highbd_idct_add_dual(q[13], q[0]);
2450a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_sub_dual(q[13], q[0]);
2460a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_sub_dual(q[12], q[1]);
2470a39d0a697ff3603e8c100300fda363658e10b23James Zern
2480a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 20, 21, &q[0], &q[1]);
2490a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[8] = highbd_idct_add_dual(q[4], q[1]);
2500a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[9] = highbd_idct_add_dual(q[5], q[0]);
2510a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[5], q[0]);
2520a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[4], q[1]);
2530a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 20, 21, q[6], q[7]);
2540a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 10, 11, q[8], q[9]);
2550a39d0a697ff3603e8c100300fda363658e10b23James Zern
2560a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 26, 27, &q[0], &q[1]);
2570a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
2580a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
2590a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
2600a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
2610a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 26, 27, q[6], q[7]);
2620a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 4, 5, q[4], q[5]);
2630a39d0a697ff3603e8c100300fda363658e10b23James Zern
2640a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 8, 9, &q[0], &q[1]);
2650a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2] = highbd_idct_add_dual(q[14], q[1]);
2660a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3] = highbd_idct_add_dual(q[15], q[0]);
2670a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_sub_dual(q[15], q[0]);
2680a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_sub_dual(q[14], q[1]);
2690a39d0a697ff3603e8c100300fda363658e10b23James Zern
2700a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 22, 23, &q[0], &q[1]);
2710a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[8] = highbd_idct_add_dual(q[4], q[1]);
2720a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[9] = highbd_idct_add_dual(q[5], q[0]);
2730a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[5], q[0]);
2740a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[4], q[1]);
2750a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 22, 23, q[6], q[7]);
2760a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 8, 9, q[8], q[9]);
2770a39d0a697ff3603e8c100300fda363658e10b23James Zern
2780a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 24, 25, &q[0], &q[1]);
2790a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
2800a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
2810a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
2820a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
2830a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 24, 25, q[6], q[7]);
2840a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_in_output(out, 6, 7, q[4], q[5]);
2850a39d0a697ff3603e8c100300fda363658e10b23James Zern}
2860a39d0a697ff3603e8c100300fda363658e10b23James Zern
2870a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
2880a39d0a697ff3603e8c100300fda363658e10b23James Zern                                             uint16_t *const dest,
2890a39d0a697ff3603e8c100300fda363658e10b23James Zern                                             const int stride,
2900a39d0a697ff3603e8c100300fda363658e10b23James Zern                                             const int16x8_t max,
2910a39d0a697ff3603e8c100300fda363658e10b23James Zern                                             int32x4x2_t *const q) {
2920a39d0a697ff3603e8c100300fda363658e10b23James Zern  uint16_t *dest0 = dest + 0 * stride;
2930a39d0a697ff3603e8c100300fda363658e10b23James Zern  uint16_t *dest1 = dest + 31 * stride;
2940a39d0a697ff3603e8c100300fda363658e10b23James Zern  uint16_t *dest2 = dest + 16 * stride;
2950a39d0a697ff3603e8c100300fda363658e10b23James Zern  uint16_t *dest3 = dest + 15 * stride;
2960a39d0a697ff3603e8c100300fda363658e10b23James Zern  const int str2 = stride << 1;
2970a39d0a697ff3603e8c100300fda363658e10b23James Zern
2980a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
2990a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3000a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest2 += str2;
3010a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest3 -= str2;
3020a39d0a697ff3603e8c100300fda363658e10b23James Zern
3030a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 30, 31, &q[0], &q[1]);
3040a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
3050a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
3060a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
3070a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
3080a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
3090a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3100a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest0 += str2;
3110a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest1 -= str2;
3120a39d0a697ff3603e8c100300fda363658e10b23James Zern
3130a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 12, 13, &q[0], &q[1]);
3140a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2] = highbd_idct_add_dual(q[10], q[1]);
3150a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3] = highbd_idct_add_dual(q[11], q[0]);
3160a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_sub_dual(q[11], q[0]);
3170a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_sub_dual(q[10], q[1]);
3180a39d0a697ff3603e8c100300fda363658e10b23James Zern
3190a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 18, 19, &q[0], &q[1]);
3200a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[8] = highbd_idct_add_dual(q[4], q[1]);
3210a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[9] = highbd_idct_add_dual(q[5], q[0]);
3220a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[5], q[0]);
3230a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[4], q[1]);
3240a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
3250a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3260a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest2 += str2;
3270a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest3 -= str2;
3280a39d0a697ff3603e8c100300fda363658e10b23James Zern
3290a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 28, 29, &q[0], &q[1]);
3300a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
3310a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
3320a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
3330a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
3340a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
3350a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3360a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest0 += str2;
3370a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest1 -= str2;
3380a39d0a697ff3603e8c100300fda363658e10b23James Zern
3390a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 10, 11, &q[0], &q[1]);
3400a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2] = highbd_idct_add_dual(q[12], q[1]);
3410a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3] = highbd_idct_add_dual(q[13], q[0]);
3420a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_sub_dual(q[13], q[0]);
3430a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_sub_dual(q[12], q[1]);
3440a39d0a697ff3603e8c100300fda363658e10b23James Zern
3450a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 20, 21, &q[0], &q[1]);
3460a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[8] = highbd_idct_add_dual(q[4], q[1]);
3470a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[9] = highbd_idct_add_dual(q[5], q[0]);
3480a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[5], q[0]);
3490a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[4], q[1]);
3500a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
3510a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3520a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest2 += str2;
3530a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest3 -= str2;
3540a39d0a697ff3603e8c100300fda363658e10b23James Zern
3550a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 26, 27, &q[0], &q[1]);
3560a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
3570a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
3580a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
3590a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
3600a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
3610a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3620a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest0 += str2;
3630a39d0a697ff3603e8c100300fda363658e10b23James Zern  dest1 -= str2;
3640a39d0a697ff3603e8c100300fda363658e10b23James Zern
3650a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 8, 9, &q[0], &q[1]);
3660a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[2] = highbd_idct_add_dual(q[14], q[1]);
3670a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[3] = highbd_idct_add_dual(q[15], q[0]);
3680a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_sub_dual(q[15], q[0]);
3690a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_sub_dual(q[14], q[1]);
3700a39d0a697ff3603e8c100300fda363658e10b23James Zern
3710a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 22, 23, &q[0], &q[1]);
3720a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[8] = highbd_idct_add_dual(q[4], q[1]);
3730a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[9] = highbd_idct_add_dual(q[5], q[0]);
3740a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[5], q[0]);
3750a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[4], q[1]);
3760a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
3770a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3780a39d0a697ff3603e8c100300fda363658e10b23James Zern
3790a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_from_output(out, 24, 25, &q[0], &q[1]);
3800a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[4] = highbd_idct_add_dual(q[2], q[1]);
3810a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[5] = highbd_idct_add_dual(q[3], q[0]);
3820a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[6] = highbd_idct_sub_dual(q[3], q[0]);
3830a39d0a697ff3603e8c100300fda363658e10b23James Zern  q[7] = highbd_idct_sub_dual(q[2], q[1]);
3840a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
3850a39d0a697ff3603e8c100300fda363658e10b23James Zern                               max);
3860a39d0a697ff3603e8c100300fda363658e10b23James Zern}
3870a39d0a697ff3603e8c100300fda363658e10b23James Zern
3880a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
3890a39d0a697ff3603e8c100300fda363658e10b23James Zern                                             uint16_t *dst, const int stride,
3900a39d0a697ff3603e8c100300fda363658e10b23James Zern                                             const int bd) {
3910a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i, idct32_pass_loop;
3920a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32_t trans_buf[32 * 8];
3930a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32_t pass1[32 * 32];
3940a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32_t pass2[32 * 32];
3950a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32_t *out;
3960a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32x4x2_t q[16];
3970a39d0a697ff3603e8c100300fda363658e10b23James Zern
3980a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
3990a39d0a697ff3603e8c100300fda363658e10b23James Zern       idct32_pass_loop++, input = pass1, out = pass2) {
4000a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 4; i++, out += 8) {  // idct32_bands_loop
4010a39d0a697ff3603e8c100300fda363658e10b23James Zern      idct32_transpose_pair(input, trans_buf);
4020a39d0a697ff3603e8c100300fda363658e10b23James Zern      input += 32 * 8;
4030a39d0a697ff3603e8c100300fda363658e10b23James Zern
4040a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
4050a39d0a697ff3603e8c100300fda363658e10b23James Zern      // BLOCK A: 16-19,28-31
4060a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
4070a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 16,17,30,31
4080a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 1
4090a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
4100a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
4110a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
4120a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
4130a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 2
4140a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[4] = highbd_idct_add_dual(q[0], q[1]);
4150a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[0], q[1]);
4160a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[6] = highbd_idct_add_dual(q[2], q[3]);
4170a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[2], q[3]);
4180a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 3
4190a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
4200a39d0a697ff3603e8c100300fda363658e10b23James Zern
4210a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 18,19,28,29
4220a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 1
4230a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
4240a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
4250a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
4260a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
4270a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 2
4280a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[3], q[2]);
4290a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[3] = highbd_idct_add_dual(q[3], q[2]);
4300a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[1], q[0]);
4310a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[2] = highbd_idct_add_dual(q[1], q[0]);
4320a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 3
4330a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
4340a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
4350a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[8] = highbd_idct_add_dual(q[4], q[2]);
4360a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[9] = highbd_idct_add_dual(q[5], q[0]);
4370a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[10] = highbd_idct_add_dual(q[7], q[1]);
4380a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[15] = highbd_idct_add_dual(q[6], q[3]);
4390a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[5], q[0]);
4400a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[7], q[1]);
4410a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 16, 31, q[8], q[15]);
4420a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 17, 30, q[9], q[10]);
4430a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 5
4440a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
4450a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 29, 18, q[1], q[0]);
4460a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
4470a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[4], q[2]);
4480a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[6], q[3]);
4490a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 5
4500a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
4510a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 19, 28, q[4], q[6]);
4520a39d0a697ff3603e8c100300fda363658e10b23James Zern
4530a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
4540a39d0a697ff3603e8c100300fda363658e10b23James Zern      // BLOCK B: 20-23,24-27
4550a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
4560a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 20,21,26,27
4570a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 1
4580a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
4590a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
4600a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
4610a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
4620a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 2
4630a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[0], q[1]);
4640a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[0] = highbd_idct_add_dual(q[0], q[1]);
4650a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[2], q[3]);
4660a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[2] = highbd_idct_add_dual(q[2], q[3]);
4670a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 3
4680a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
4690a39d0a697ff3603e8c100300fda363658e10b23James Zern
4700a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 22,23,24,25
4710a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 1
4720a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
4730a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
4740a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
4750a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
4760a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 2
4770a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[4], q[5]);
4780a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[5] = highbd_idct_add_dual(q[4], q[5]);
4790a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[6], q[7]);
4800a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[6] = highbd_idct_add_dual(q[6], q[7]);
4810a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 3
4820a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
4830a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
4840a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[10] = highbd_idct_add_dual(q[7], q[1]);
4850a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[11] = highbd_idct_add_dual(q[5], q[0]);
4860a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[12] = highbd_idct_add_dual(q[6], q[2]);
4870a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[15] = highbd_idct_add_dual(q[4], q[3]);
4880a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 6
4890a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_output(out, 16, 17, &q[14], &q[13]);
4900a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[8] = highbd_idct_add_dual(q[14], q[11]);
4910a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[9] = highbd_idct_add_dual(q[13], q[10]);
4920a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[13], q[10]);
4930a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[11] = highbd_idct_sub_dual(q[14], q[11]);
4940a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 17, 16, q[9], q[8]);
4950a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_output(out, 30, 31, &q[14], &q[9]);
4960a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[8] = highbd_idct_sub_dual(q[9], q[12]);
4970a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[10] = highbd_idct_add_dual(q[14], q[15]);
4980a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[14], q[15]);
4990a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[12] = highbd_idct_add_dual(q[9], q[12]);
5000a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 30, 31, q[10], q[12]);
5010a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 7
5020a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
5030a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 25, 22, q[14], q[13]);
5040a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
5050a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 24, 23, q[14], q[13]);
5060a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
5070a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[5], q[0]);
5080a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[6], q[2]);
5090a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
5100a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[7], q[1]);
5110a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[4], q[3]);
5120a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
5130a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 6
5140a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_output(out, 18, 19, &q[14], &q[13]);
5150a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[8] = highbd_idct_add_dual(q[14], q[1]);
5160a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[9] = highbd_idct_add_dual(q[13], q[6]);
5170a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[13], q[6]);
5180a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[1] = highbd_idct_sub_dual(q[14], q[1]);
5190a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 18, 19, q[8], q[9]);
5200a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_output(out, 28, 29, &q[8], &q[9]);
5210a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[8], q[5]);
5220a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[10] = highbd_idct_add_dual(q[8], q[5]);
5230a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[11] = highbd_idct_add_dual(q[9], q[0]);
5240a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[0] = highbd_idct_sub_dual(q[9], q[0]);
5250a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 28, 29, q[10], q[11]);
5260a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 7
5270a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
5280a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 20, 27, q[13], q[14]);
5290a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
5300a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 21, 26, q[1], q[0]);
5310a39d0a697ff3603e8c100300fda363658e10b23James Zern
5320a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
5330a39d0a697ff3603e8c100300fda363658e10b23James Zern      // BLOCK C: 8-10,11-15
5340a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
5350a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 8,9,14,15
5360a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 2
5370a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
5380a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
5390a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
5400a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
5410a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 3
5420a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[0], q[1]);
5430a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[0] = highbd_idct_add_dual(q[0], q[1]);
5440a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[2], q[3]);
5450a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[2] = highbd_idct_add_dual(q[2], q[3]);
5460a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
5470a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
5480a39d0a697ff3603e8c100300fda363658e10b23James Zern
5490a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 10,11,12,13
5500a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 2
5510a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
5520a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
5530a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
5540a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
5550a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 3
5560a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[4], q[5]);
5570a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[5] = highbd_idct_add_dual(q[4], q[5]);
5580a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[6], q[7]);
5590a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[6] = highbd_idct_add_dual(q[6], q[7]);
5600a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
5610a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
5620a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 5
5630a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[8] = highbd_idct_add_dual(q[0], q[5]);
5640a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[9] = highbd_idct_add_dual(q[1], q[7]);
5650a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[1], q[7]);
5660a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[3], q[4]);
5670a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[10] = highbd_idct_add_dual(q[3], q[4]);
5680a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[15] = highbd_idct_add_dual(q[2], q[6]);
5690a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 8, 15, q[8], q[15]);
5700a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 9, 14, q[9], q[10]);
5710a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 6
5720a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
5730a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 13, 10, q[3], q[1]);
5740a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[0], q[5]);
5750a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[2], q[6]);
5760a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
5770a39d0a697ff3603e8c100300fda363658e10b23James Zern      store_in_output(out, 11, 12, q[1], q[3]);
5780a39d0a697ff3603e8c100300fda363658e10b23James Zern
5790a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
5800a39d0a697ff3603e8c100300fda363658e10b23James Zern      // BLOCK D: 0-3,4-7
5810a39d0a697ff3603e8c100300fda363658e10b23James Zern      // -----------------------------------------
5820a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 4,5,6,7
5830a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 3
5840a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
5850a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
5860a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
5870a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
5880a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
5890a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[0], q[1]);
5900a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[0] = highbd_idct_add_dual(q[0], q[1]);
5910a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[2], q[3]);
5920a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[2] = highbd_idct_add_dual(q[2], q[3]);
5930a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 5
5940a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
5950a39d0a697ff3603e8c100300fda363658e10b23James Zern
5960a39d0a697ff3603e8c100300fda363658e10b23James Zern      // generate 0,1,2,3
5970a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 4
5980a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
5990a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
6000a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
6010a39d0a697ff3603e8c100300fda363658e10b23James Zern      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
6020a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 5
6030a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[4] = highbd_idct_add_dual(q[7], q[6]);
6040a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[7] = highbd_idct_sub_dual(q[7], q[6]);
6050a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[6] = highbd_idct_sub_dual(q[5], q[14]);
6060a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[5] = highbd_idct_add_dual(q[5], q[14]);
6070a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 6
6080a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[8] = highbd_idct_add_dual(q[4], q[2]);
6090a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[9] = highbd_idct_add_dual(q[5], q[3]);
6100a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[10] = highbd_idct_add_dual(q[6], q[1]);
6110a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[11] = highbd_idct_add_dual(q[7], q[0]);
6120a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[12] = highbd_idct_sub_dual(q[7], q[0]);
6130a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[13] = highbd_idct_sub_dual(q[6], q[1]);
6140a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[14] = highbd_idct_sub_dual(q[5], q[3]);
6150a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[15] = highbd_idct_sub_dual(q[4], q[2]);
6160a39d0a697ff3603e8c100300fda363658e10b23James Zern      // part of stage 7
6170a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_output(out, 14, 15, &q[0], &q[1]);
6180a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[2] = highbd_idct_add_dual(q[8], q[1]);
6190a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[3] = highbd_idct_add_dual(q[9], q[0]);
6200a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[4] = highbd_idct_sub_dual(q[9], q[0]);
6210a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[5] = highbd_idct_sub_dual(q[8], q[1]);
6220a39d0a697ff3603e8c100300fda363658e10b23James Zern      load_from_output(out, 16, 17, &q[0], &q[1]);
6230a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[8] = highbd_idct_add_dual(q[4], q[1]);
6240a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[9] = highbd_idct_add_dual(q[5], q[0]);
6250a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[6] = highbd_idct_sub_dual(q[5], q[0]);
6260a39d0a697ff3603e8c100300fda363658e10b23James Zern      q[7] = highbd_idct_sub_dual(q[4], q[1]);
6270a39d0a697ff3603e8c100300fda363658e10b23James Zern
6280a39d0a697ff3603e8c100300fda363658e10b23James Zern      if (idct32_pass_loop == 0) {
6290a39d0a697ff3603e8c100300fda363658e10b23James Zern        idct32_bands_end_1st_pass(out, q);
6300a39d0a697ff3603e8c100300fda363658e10b23James Zern      } else {
6310a39d0a697ff3603e8c100300fda363658e10b23James Zern        const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
6320a39d0a697ff3603e8c100300fda363658e10b23James Zern        idct32_bands_end_2nd_pass(out, dst, stride, max, q);
6330a39d0a697ff3603e8c100300fda363658e10b23James Zern        dst += 8;
6340a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
6350a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
6360a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
6370a39d0a697ff3603e8c100300fda363658e10b23James Zern}
6380a39d0a697ff3603e8c100300fda363658e10b23James Zern
6390a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
6400a39d0a697ff3603e8c100300fda363658e10b23James Zern                                        int stride, int bd) {
6410a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (bd == 8) {
6420a39d0a697ff3603e8c100300fda363658e10b23James Zern    vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
6430a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
6440a39d0a697ff3603e8c100300fda363658e10b23James Zern    vpx_highbd_idct32_32_neon(input, dest, stride, bd);
6450a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
6460a39d0a697ff3603e8c100300fda363658e10b23James Zern}
647