10a39d0a697ff3603e8c100300fda363658e10b23James Zern/*
20a39d0a697ff3603e8c100300fda363658e10b23James Zern *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
30a39d0a697ff3603e8c100300fda363658e10b23James Zern *
40a39d0a697ff3603e8c100300fda363658e10b23James Zern *  Use of this source code is governed by a BSD-style license
50a39d0a697ff3603e8c100300fda363658e10b23James Zern *  that can be found in the LICENSE file in the root of the source
60a39d0a697ff3603e8c100300fda363658e10b23James Zern *  tree. An additional intellectual property rights grant can be found
70a39d0a697ff3603e8c100300fda363658e10b23James Zern *  in the file PATENTS.  All contributing project authors may
80a39d0a697ff3603e8c100300fda363658e10b23James Zern *  be found in the AUTHORS file in the root of the source tree.
90a39d0a697ff3603e8c100300fda363658e10b23James Zern */
100a39d0a697ff3603e8c100300fda363658e10b23James Zern
110a39d0a697ff3603e8c100300fda363658e10b23James Zern#include <arm_neon.h>
120a39d0a697ff3603e8c100300fda363658e10b23James Zern
130a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_config.h"
140a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_dsp_rtcd.h"
150a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/arm/idct_neon.h"
160a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/arm/transpose_neon.h"
170a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/txfm_common.h"
180a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Only for the first pass of the _34_ variant. Since it only uses values from
// the top left 8x8 it can safely assume all the remaining values are 0 and skip
// an awful lot of calculations. In fact, only the first 6 columns make the cut.
// None of the elements in the 7th or 8th column are used so it skips any
// calculations involving input[6] and input[7] too.
// In C this does a single row of 32 for each call. Here it transposes the top
// left 8x8 to allow using SIMD.
260a39d0a697ff3603e8c100300fda363658e10b23James Zern
270a39d0a697ff3603e8c100300fda363658e10b23James Zern// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
280a39d0a697ff3603e8c100300fda363658e10b23James Zern// coefficients as follows:
290a39d0a697ff3603e8c100300fda363658e10b23James Zern//    0  1  2  3  4  5  6  7
300a39d0a697ff3603e8c100300fda363658e10b23James Zern// 0  0  2  5 10 17 25
310a39d0a697ff3603e8c100300fda363658e10b23James Zern// 1  1  4  8 15 22 30
320a39d0a697ff3603e8c100300fda363658e10b23James Zern// 2  3  7 12 18 28
330a39d0a697ff3603e8c100300fda363658e10b23James Zern// 3  6 11 16 23 31
340a39d0a697ff3603e8c100300fda363658e10b23James Zern// 4  9 14 19 29
350a39d0a697ff3603e8c100300fda363658e10b23James Zern// 5 13 20 26
360a39d0a697ff3603e8c100300fda363658e10b23James Zern// 6 21 27 33
370a39d0a697ff3603e8c100300fda363658e10b23James Zern// 7 24 32
380a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) {
390a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32x4x2_t in[8], s1[32], s2[32], s3[32];
400a39d0a697ff3603e8c100300fda363658e10b23James Zern
410a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[0].val[0] = vld1q_s32(input);
420a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[0].val[1] = vld1q_s32(input + 4);
430a39d0a697ff3603e8c100300fda363658e10b23James Zern  input += 32;
440a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[1].val[0] = vld1q_s32(input);
450a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[1].val[1] = vld1q_s32(input + 4);
460a39d0a697ff3603e8c100300fda363658e10b23James Zern  input += 32;
470a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[2].val[0] = vld1q_s32(input);
480a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[2].val[1] = vld1q_s32(input + 4);
490a39d0a697ff3603e8c100300fda363658e10b23James Zern  input += 32;
500a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[3].val[0] = vld1q_s32(input);
510a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[3].val[1] = vld1q_s32(input + 4);
520a39d0a697ff3603e8c100300fda363658e10b23James Zern  input += 32;
530a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[4].val[0] = vld1q_s32(input);
540a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[4].val[1] = vld1q_s32(input + 4);
550a39d0a697ff3603e8c100300fda363658e10b23James Zern  input += 32;
560a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[5].val[0] = vld1q_s32(input);
570a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[5].val[1] = vld1q_s32(input + 4);
580a39d0a697ff3603e8c100300fda363658e10b23James Zern  input += 32;
590a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[6].val[0] = vld1q_s32(input);
600a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[6].val[1] = vld1q_s32(input + 4);
610a39d0a697ff3603e8c100300fda363658e10b23James Zern  input += 32;
620a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[7].val[0] = vld1q_s32(input);
630a39d0a697ff3603e8c100300fda363658e10b23James Zern  in[7].val[1] = vld1q_s32(input + 4);
640a39d0a697ff3603e8c100300fda363658e10b23James Zern  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
650a39d0a697ff3603e8c100300fda363658e10b23James Zern                    &in[7]);
660a39d0a697ff3603e8c100300fda363658e10b23James Zern
670a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 1
680a39d0a697ff3603e8c100300fda363658e10b23James Zern  // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
690a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
700a39d0a697ff3603e8c100300fda363658e10b23James Zern  // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
710a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
720a39d0a697ff3603e8c100300fda363658e10b23James Zern
730a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
740a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
750a39d0a697ff3603e8c100300fda363658e10b23James Zern
760a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
770a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
780a39d0a697ff3603e8c100300fda363658e10b23James Zern
790a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 2
800a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
810a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
820a39d0a697ff3603e8c100300fda363658e10b23James Zern
830a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 3
840a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
850a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
860a39d0a697ff3603e8c100300fda363658e10b23James Zern
870a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
880a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[31], cospi_28_64);
890a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
900a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[31], cospi_4_64);
910a39d0a697ff3603e8c100300fda363658e10b23James Zern
920a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
930a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[27], cospi_12_64);
940a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
950a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[27], cospi_20_64);
960a39d0a697ff3603e8c100300fda363658e10b23James Zern
970a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
980a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[24], -cospi_20_64);
990a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
1000a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[24], cospi_12_64);
1010a39d0a697ff3603e8c100300fda363658e10b23James Zern
1020a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 4
1030a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
1040a39d0a697ff3603e8c100300fda363658e10b23James Zern
1050a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
1060a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                        s2[15], cospi_24_64);
1070a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
1080a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[15], cospi_8_64);
1090a39d0a697ff3603e8c100300fda363658e10b23James Zern
1100a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
1110a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
1120a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
1130a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
1140a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
1150a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
1160a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
1170a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
1180a39d0a697ff3603e8c100300fda363658e10b23James Zern
1190a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 5
1200a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
1210a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
1220a39d0a697ff3603e8c100300fda363658e10b23James Zern
1230a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64,
1240a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[30], cospi_24_64);
1250a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64,
1260a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[30], cospi_8_64);
1270a39d0a697ff3603e8c100300fda363658e10b23James Zern
1280a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64,
1290a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[31], cospi_24_64);
1300a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64,
1310a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[31], cospi_8_64);
1320a39d0a697ff3603e8c100300fda363658e10b23James Zern
1330a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
1340a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[27], -cospi_8_64);
1350a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
1360a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[27], cospi_24_64);
1370a39d0a697ff3603e8c100300fda363658e10b23James Zern
1380a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
1390a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[26], -cospi_8_64);
1400a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
1410a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[26], cospi_24_64);
1420a39d0a697ff3603e8c100300fda363658e10b23James Zern
1430a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 6
1440a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
1450a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
1460a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
1470a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
1480a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
1490a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
1500a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
1510a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
1520a39d0a697ff3603e8c100300fda363658e10b23James Zern
1530a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64);
1540a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64);
1550a39d0a697ff3603e8c100300fda363658e10b23James Zern
1560a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64);
1570a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64);
1580a39d0a697ff3603e8c100300fda363658e10b23James Zern
1590a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[16] = highbd_idct_add_dual(s1[16], s2[23]);
1600a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[17] = highbd_idct_add_dual(s1[17], s2[22]);
1610a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
1620a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
1630a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
1640a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
1650a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[22] = highbd_idct_sub_dual(s1[17], s2[22]);
1660a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[23] = highbd_idct_sub_dual(s1[16], s2[23]);
1670a39d0a697ff3603e8c100300fda363658e10b23James Zern
1680a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[24] = highbd_idct_sub_dual(s1[31], s2[24]);
1690a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[25] = highbd_idct_sub_dual(s1[30], s2[25]);
1700a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
1710a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
1720a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
1730a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
1740a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[30] = highbd_idct_add_dual(s2[25], s1[30]);
1750a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[31] = highbd_idct_add_dual(s2[24], s1[31]);
1760a39d0a697ff3603e8c100300fda363658e10b23James Zern
1770a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 7
1780a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[0] = highbd_idct_add_dual(s2[0], s2[15]);
1790a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[1] = highbd_idct_add_dual(s2[1], s2[14]);
1800a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
1810a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
1820a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
1830a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
1840a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[6] = highbd_idct_add_dual(s2[6], s2[9]);
1850a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[7] = highbd_idct_add_dual(s2[7], s2[8]);
1860a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[8] = highbd_idct_sub_dual(s2[7], s2[8]);
1870a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[9] = highbd_idct_sub_dual(s2[6], s2[9]);
1880a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
1890a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
1900a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
1910a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
1920a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[14] = highbd_idct_sub_dual(s2[1], s2[14]);
1930a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[15] = highbd_idct_sub_dual(s2[0], s2[15]);
1940a39d0a697ff3603e8c100300fda363658e10b23James Zern
1950a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
1960a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
1970a39d0a697ff3603e8c100300fda363658e10b23James Zern
1980a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
1990a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
2000a39d0a697ff3603e8c100300fda363658e10b23James Zern
2010a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64);
2020a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64);
2030a39d0a697ff3603e8c100300fda363658e10b23James Zern
2040a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64);
2050a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64);
2060a39d0a697ff3603e8c100300fda363658e10b23James Zern
2070a39d0a697ff3603e8c100300fda363658e10b23James Zern  // final stage
2080a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[0] = highbd_idct_add_dual(s1[0], s2[31]);
2090a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[1] = highbd_idct_add_dual(s1[1], s2[30]);
2100a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[2] = highbd_idct_add_dual(s1[2], s2[29]);
2110a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[3] = highbd_idct_add_dual(s1[3], s2[28]);
2120a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[4] = highbd_idct_add_dual(s1[4], s1[27]);
2130a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[5] = highbd_idct_add_dual(s1[5], s1[26]);
2140a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[6] = highbd_idct_add_dual(s1[6], s1[25]);
2150a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[7] = highbd_idct_add_dual(s1[7], s1[24]);
2160a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[8] = highbd_idct_add_dual(s1[8], s1[23]);
2170a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[9] = highbd_idct_add_dual(s1[9], s1[22]);
2180a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[10] = highbd_idct_add_dual(s1[10], s1[21]);
2190a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[11] = highbd_idct_add_dual(s1[11], s1[20]);
2200a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[12] = highbd_idct_add_dual(s1[12], s2[19]);
2210a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[13] = highbd_idct_add_dual(s1[13], s2[18]);
2220a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[14] = highbd_idct_add_dual(s1[14], s2[17]);
2230a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[15] = highbd_idct_add_dual(s1[15], s2[16]);
2240a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[16] = highbd_idct_sub_dual(s1[15], s2[16]);
2250a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[17] = highbd_idct_sub_dual(s1[14], s2[17]);
2260a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[18] = highbd_idct_sub_dual(s1[13], s2[18]);
2270a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[19] = highbd_idct_sub_dual(s1[12], s2[19]);
2280a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[20] = highbd_idct_sub_dual(s1[11], s1[20]);
2290a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[21] = highbd_idct_sub_dual(s1[10], s1[21]);
2300a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[22] = highbd_idct_sub_dual(s1[9], s1[22]);
2310a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[23] = highbd_idct_sub_dual(s1[8], s1[23]);
2320a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[24] = highbd_idct_sub_dual(s1[7], s1[24]);
2330a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[25] = highbd_idct_sub_dual(s1[6], s1[25]);
2340a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[26] = highbd_idct_sub_dual(s1[5], s1[26]);
2350a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[27] = highbd_idct_sub_dual(s1[4], s1[27]);
2360a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[28] = highbd_idct_sub_dual(s1[3], s2[28]);
2370a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[29] = highbd_idct_sub_dual(s1[2], s2[29]);
2380a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[30] = highbd_idct_sub_dual(s1[1], s2[30]);
2390a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[31] = highbd_idct_sub_dual(s1[0], s2[31]);
2400a39d0a697ff3603e8c100300fda363658e10b23James Zern
2410a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[0].val[0]);
2420a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2430a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[0].val[1]);
2440a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2450a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[1].val[0]);
2460a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2470a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[1].val[1]);
2480a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2490a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[2].val[0]);
2500a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2510a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[2].val[1]);
2520a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2530a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[3].val[0]);
2540a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2550a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[3].val[1]);
2560a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2570a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[4].val[0]);
2580a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2590a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[4].val[1]);
2600a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2610a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[5].val[0]);
2620a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2630a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[5].val[1]);
2640a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2650a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[6].val[0]);
2660a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2670a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[6].val[1]);
2680a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2690a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[7].val[0]);
2700a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2710a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[7].val[1]);
2720a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2730a39d0a697ff3603e8c100300fda363658e10b23James Zern
2740a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[8].val[0]);
2750a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2760a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[8].val[1]);
2770a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2780a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[9].val[0]);
2790a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2800a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[9].val[1]);
2810a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2820a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[10].val[0]);
2830a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2840a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[10].val[1]);
2850a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2860a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[11].val[0]);
2870a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2880a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[11].val[1]);
2890a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2900a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[12].val[0]);
2910a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2920a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[12].val[1]);
2930a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2940a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[13].val[0]);
2950a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2960a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[13].val[1]);
2970a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
2980a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[14].val[0]);
2990a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3000a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[14].val[1]);
3010a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3020a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[15].val[0]);
3030a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3040a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[15].val[1]);
3050a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3060a39d0a697ff3603e8c100300fda363658e10b23James Zern
3070a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[16].val[0]);
3080a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3090a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[16].val[1]);
3100a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3110a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[17].val[0]);
3120a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3130a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[17].val[1]);
3140a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3150a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[18].val[0]);
3160a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3170a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[18].val[1]);
3180a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3190a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[19].val[0]);
3200a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3210a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[19].val[1]);
3220a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3230a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[20].val[0]);
3240a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3250a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[20].val[1]);
3260a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3270a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[21].val[0]);
3280a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3290a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[21].val[1]);
3300a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3310a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[22].val[0]);
3320a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3330a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[22].val[1]);
3340a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3350a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[23].val[0]);
3360a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3370a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[23].val[1]);
3380a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3390a39d0a697ff3603e8c100300fda363658e10b23James Zern
3400a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[24].val[0]);
3410a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3420a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[24].val[1]);
3430a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3440a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[25].val[0]);
3450a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3460a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[25].val[1]);
3470a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3480a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[26].val[0]);
3490a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3500a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[26].val[1]);
3510a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3520a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[27].val[0]);
3530a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3540a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[27].val[1]);
3550a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3560a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[28].val[0]);
3570a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3580a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[28].val[1]);
3590a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3600a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[29].val[0]);
3610a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3620a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[29].val[1]);
3630a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3640a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[30].val[0]);
3650a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3660a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[30].val[1]);
3670a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3680a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[31].val[0]);
3690a39d0a697ff3603e8c100300fda363658e10b23James Zern  output += 4;
3700a39d0a697ff3603e8c100300fda363658e10b23James Zern  vst1q_s32(output, s3[31].val[1]);
3710a39d0a697ff3603e8c100300fda363658e10b23James Zern}
3720a39d0a697ff3603e8c100300fda363658e10b23James Zern
3730a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
3740a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     int stride, const int bd) {
3750a39d0a697ff3603e8c100300fda363658e10b23James Zern  int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32];
3760a39d0a697ff3603e8c100300fda363658e10b23James Zern
3770a39d0a697ff3603e8c100300fda363658e10b23James Zern  load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
3780a39d0a697ff3603e8c100300fda363658e10b23James Zern                             &in[5], &in[6], &in[7]);
3790a39d0a697ff3603e8c100300fda363658e10b23James Zern
3800a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 1
3810a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
3820a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
3830a39d0a697ff3603e8c100300fda363658e10b23James Zern
3840a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Different for _8_
3850a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
3860a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
3870a39d0a697ff3603e8c100300fda363658e10b23James Zern
3880a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
3890a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
3900a39d0a697ff3603e8c100300fda363658e10b23James Zern
3910a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
3920a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
3930a39d0a697ff3603e8c100300fda363658e10b23James Zern
3940a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 2
3950a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
3960a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
3970a39d0a697ff3603e8c100300fda363658e10b23James Zern
3980a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
3990a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
4000a39d0a697ff3603e8c100300fda363658e10b23James Zern
4010a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 3
4020a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
4030a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
4040a39d0a697ff3603e8c100300fda363658e10b23James Zern
4050a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
4060a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[31], cospi_28_64);
4070a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
4080a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[31], cospi_4_64);
4090a39d0a697ff3603e8c100300fda363658e10b23James Zern
4100a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Different for _8_
4110a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64,
4120a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[28], -cospi_4_64);
4130a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64,
4140a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[28], cospi_28_64);
4150a39d0a697ff3603e8c100300fda363658e10b23James Zern
4160a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
4170a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[27], cospi_12_64);
4180a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
4190a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[27], cospi_20_64);
4200a39d0a697ff3603e8c100300fda363658e10b23James Zern
4210a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
4220a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[24], -cospi_20_64);
4230a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
4240a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s1[24], cospi_12_64);
4250a39d0a697ff3603e8c100300fda363658e10b23James Zern
4260a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 4
4270a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
4280a39d0a697ff3603e8c100300fda363658e10b23James Zern
4290a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
4300a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                        s2[15], cospi_24_64);
4310a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
4320a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[15], cospi_8_64);
4330a39d0a697ff3603e8c100300fda363658e10b23James Zern
4340a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64,
4350a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[12], -cospi_8_64);
4360a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64,
4370a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[12], cospi_24_64);
4380a39d0a697ff3603e8c100300fda363658e10b23James Zern
4390a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[16] = highbd_idct_add_dual(s1[16], s1[19]);
4400a39d0a697ff3603e8c100300fda363658e10b23James Zern
4410a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[17] = highbd_idct_add_dual(s1[17], s1[18]);
4420a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[18] = highbd_idct_sub_dual(s1[17], s1[18]);
4430a39d0a697ff3603e8c100300fda363658e10b23James Zern
4440a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[19] = highbd_idct_sub_dual(s1[16], s1[19]);
4450a39d0a697ff3603e8c100300fda363658e10b23James Zern
4460a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
4470a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
4480a39d0a697ff3603e8c100300fda363658e10b23James Zern
4490a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
4500a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
4510a39d0a697ff3603e8c100300fda363658e10b23James Zern
4520a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
4530a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
4540a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
4550a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
4560a39d0a697ff3603e8c100300fda363658e10b23James Zern
4570a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[28] = highbd_idct_sub_dual(s1[31], s1[28]);
4580a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[29] = highbd_idct_sub_dual(s1[30], s1[29]);
4590a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[30] = highbd_idct_add_dual(s1[29], s1[30]);
4600a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[31] = highbd_idct_add_dual(s1[28], s1[31]);
4610a39d0a697ff3603e8c100300fda363658e10b23James Zern
4620a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 5
4630a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
4640a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
4650a39d0a697ff3603e8c100300fda363658e10b23James Zern
4660a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[8] = highbd_idct_add_dual(s2[8], s2[11]);
4670a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[9] = highbd_idct_add_dual(s2[9], s2[10]);
4680a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[10] = highbd_idct_sub_dual(s2[9], s2[10]);
4690a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[11] = highbd_idct_sub_dual(s2[8], s2[11]);
4700a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[12] = highbd_idct_sub_dual(s2[15], s2[12]);
4710a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[13] = highbd_idct_sub_dual(s2[14], s2[13]);
4720a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[14] = highbd_idct_add_dual(s2[13], s2[14]);
4730a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[15] = highbd_idct_add_dual(s2[12], s2[15]);
4740a39d0a697ff3603e8c100300fda363658e10b23James Zern
4750a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64,
4760a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[29], cospi_24_64);
4770a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64,
4780a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[29], cospi_8_64);
4790a39d0a697ff3603e8c100300fda363658e10b23James Zern
4800a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64,
4810a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[28], cospi_24_64);
4820a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64,
4830a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[28], cospi_8_64);
4840a39d0a697ff3603e8c100300fda363658e10b23James Zern
4850a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
4860a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[27], -cospi_8_64);
4870a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
4880a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[27], cospi_24_64);
4890a39d0a697ff3603e8c100300fda363658e10b23James Zern
4900a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
4910a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[26], -cospi_8_64);
4920a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
4930a39d0a697ff3603e8c100300fda363658e10b23James Zern                                                         s2[26], cospi_24_64);
4940a39d0a697ff3603e8c100300fda363658e10b23James Zern
4950a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 6
4960a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
4970a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
4980a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
4990a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
5000a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
5010a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
5020a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
5030a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
5040a39d0a697ff3603e8c100300fda363658e10b23James Zern
5050a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64);
5060a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64);
5070a39d0a697ff3603e8c100300fda363658e10b23James Zern
5080a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64);
5090a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64);
5100a39d0a697ff3603e8c100300fda363658e10b23James Zern
5110a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[16] = highbd_idct_add_dual(s2[16], s2[23]);
5120a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[17] = highbd_idct_add_dual(s2[17], s2[22]);
5130a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
5140a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
5150a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
5160a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
5170a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[22] = highbd_idct_sub_dual(s2[17], s2[22]);
5180a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[23] = highbd_idct_sub_dual(s2[16], s2[23]);
5190a39d0a697ff3603e8c100300fda363658e10b23James Zern
5200a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[24] = highbd_idct_sub_dual(s2[31], s2[24]);
5210a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[25] = highbd_idct_sub_dual(s2[30], s2[25]);
5220a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
5230a39d0a697ff3603e8c100300fda363658e10b23James Zern  s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
5240a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
5250a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
5260a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[30] = highbd_idct_add_dual(s2[25], s2[30]);
5270a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[31] = highbd_idct_add_dual(s2[24], s2[31]);
5280a39d0a697ff3603e8c100300fda363658e10b23James Zern
5290a39d0a697ff3603e8c100300fda363658e10b23James Zern  // stage 7
5300a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[0] = highbd_idct_add_dual(s2[0], s1[15]);
5310a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[1] = highbd_idct_add_dual(s2[1], s1[14]);
5320a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
5330a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
5340a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
5350a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
5360a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[6] = highbd_idct_add_dual(s2[6], s1[9]);
5370a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[7] = highbd_idct_add_dual(s2[7], s1[8]);
5380a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[8] = highbd_idct_sub_dual(s2[7], s1[8]);
5390a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[9] = highbd_idct_sub_dual(s2[6], s1[9]);
5400a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
5410a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
5420a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
5430a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
5440a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[14] = highbd_idct_sub_dual(s2[1], s1[14]);
5450a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[15] = highbd_idct_sub_dual(s2[0], s1[15]);
5460a39d0a697ff3603e8c100300fda363658e10b23James Zern
5470a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
5480a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
5490a39d0a697ff3603e8c100300fda363658e10b23James Zern
5500a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
5510a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
5520a39d0a697ff3603e8c100300fda363658e10b23James Zern
5530a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64);
5540a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64);
5550a39d0a697ff3603e8c100300fda363658e10b23James Zern
5560a39d0a697ff3603e8c100300fda363658e10b23James Zern  s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64);
5570a39d0a697ff3603e8c100300fda363658e10b23James Zern  s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64);
5580a39d0a697ff3603e8c100300fda363658e10b23James Zern
5590a39d0a697ff3603e8c100300fda363658e10b23James Zern  // final stage
5600a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[0] = highbd_idct_add_dual(s1[0], s2[31]);
5610a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[1] = highbd_idct_add_dual(s1[1], s2[30]);
5620a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[2] = highbd_idct_add_dual(s1[2], s2[29]);
5630a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[3] = highbd_idct_add_dual(s1[3], s2[28]);
5640a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[4] = highbd_idct_add_dual(s1[4], s1[27]);
5650a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[5] = highbd_idct_add_dual(s1[5], s1[26]);
5660a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[6] = highbd_idct_add_dual(s1[6], s1[25]);
5670a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[7] = highbd_idct_add_dual(s1[7], s1[24]);
5680a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[8] = highbd_idct_add_dual(s1[8], s2[23]);
5690a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[9] = highbd_idct_add_dual(s1[9], s2[22]);
5700a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[10] = highbd_idct_add_dual(s1[10], s1[21]);
5710a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[11] = highbd_idct_add_dual(s1[11], s1[20]);
5720a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[12] = highbd_idct_add_dual(s1[12], s2[19]);
5730a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[13] = highbd_idct_add_dual(s1[13], s2[18]);
5740a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[14] = highbd_idct_add_dual(s1[14], s1[17]);
5750a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[15] = highbd_idct_add_dual(s1[15], s1[16]);
5760a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[16] = highbd_idct_sub_dual(s1[15], s1[16]);
5770a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[17] = highbd_idct_sub_dual(s1[14], s1[17]);
5780a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[18] = highbd_idct_sub_dual(s1[13], s2[18]);
5790a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[19] = highbd_idct_sub_dual(s1[12], s2[19]);
5800a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[20] = highbd_idct_sub_dual(s1[11], s1[20]);
5810a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[21] = highbd_idct_sub_dual(s1[10], s1[21]);
5820a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[22] = highbd_idct_sub_dual(s1[9], s2[22]);
5830a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[23] = highbd_idct_sub_dual(s1[8], s2[23]);
5840a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[24] = highbd_idct_sub_dual(s1[7], s1[24]);
5850a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[25] = highbd_idct_sub_dual(s1[6], s1[25]);
5860a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[26] = highbd_idct_sub_dual(s1[5], s1[26]);
5870a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[27] = highbd_idct_sub_dual(s1[4], s1[27]);
5880a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[28] = highbd_idct_sub_dual(s1[3], s2[28]);
5890a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[29] = highbd_idct_sub_dual(s1[2], s2[29]);
5900a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[30] = highbd_idct_sub_dual(s1[1], s2[30]);
5910a39d0a697ff3603e8c100300fda363658e10b23James Zern  out[31] = highbd_idct_sub_dual(s1[0], s2[31]);
5920a39d0a697ff3603e8c100300fda363658e10b23James Zern
5930a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_idct16x16_add_store(out, output, stride, bd);
5940a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
5950a39d0a697ff3603e8c100300fda363658e10b23James Zern}
5960a39d0a697ff3603e8c100300fda363658e10b23James Zern
5970a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
5980a39d0a697ff3603e8c100300fda363658e10b23James Zern                                      int stride, int bd) {
5990a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i;
6000a39d0a697ff3603e8c100300fda363658e10b23James Zern
6010a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (bd == 8) {
6020a39d0a697ff3603e8c100300fda363658e10b23James Zern    int16_t temp[32 * 8];
6030a39d0a697ff3603e8c100300fda363658e10b23James Zern    int16_t *t = temp;
6040a39d0a697ff3603e8c100300fda363658e10b23James Zern
6050a39d0a697ff3603e8c100300fda363658e10b23James Zern    vpx_idct32_6_neon(input, t);
6060a39d0a697ff3603e8c100300fda363658e10b23James Zern
6070a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 32; i += 8) {
6080a39d0a697ff3603e8c100300fda363658e10b23James Zern      vpx_idct32_8_neon(t, dest, stride, 1);
6090a39d0a697ff3603e8c100300fda363658e10b23James Zern      t += (8 * 8);
6100a39d0a697ff3603e8c100300fda363658e10b23James Zern      dest += 8;
6110a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
6120a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
6130a39d0a697ff3603e8c100300fda363658e10b23James Zern    int32_t temp[32 * 8];
6140a39d0a697ff3603e8c100300fda363658e10b23James Zern    int32_t *t = temp;
6150a39d0a697ff3603e8c100300fda363658e10b23James Zern
6160a39d0a697ff3603e8c100300fda363658e10b23James Zern    vpx_highbd_idct32_6_neon(input, t);
6170a39d0a697ff3603e8c100300fda363658e10b23James Zern
6180a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 32; i += 8) {
6190a39d0a697ff3603e8c100300fda363658e10b23James Zern      vpx_highbd_idct32_8_neon(t, dest, stride, bd);
6200a39d0a697ff3603e8c100300fda363658e10b23James Zern      t += (8 * 8);
6210a39d0a697ff3603e8c100300fda363658e10b23James Zern      dest += 8;
6220a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
6230a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
6240a39d0a697ff3603e8c100300fda363658e10b23James Zern}
625