1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <arm_neon.h> 12 13#include "./vpx_config.h" 14#include "./vpx_dsp_rtcd.h" 15#include "vpx_dsp/arm/idct_neon.h" 16#include "vpx_dsp/arm/transpose_neon.h" 17#include "vpx_dsp/txfm_common.h" 18 19static INLINE void load_8x8_s32_dual( 20 const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1, 21 int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4, 22 int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) { 23 in0->val[0] = vld1q_s32(input); 24 in0->val[1] = vld1q_s32(input + 4); 25 input += 32; 26 in1->val[0] = vld1q_s32(input); 27 in1->val[1] = vld1q_s32(input + 4); 28 input += 32; 29 in2->val[0] = vld1q_s32(input); 30 in2->val[1] = vld1q_s32(input + 4); 31 input += 32; 32 in3->val[0] = vld1q_s32(input); 33 in3->val[1] = vld1q_s32(input + 4); 34 input += 32; 35 in4->val[0] = vld1q_s32(input); 36 in4->val[1] = vld1q_s32(input + 4); 37 input += 32; 38 in5->val[0] = vld1q_s32(input); 39 in5->val[1] = vld1q_s32(input + 4); 40 input += 32; 41 in6->val[0] = vld1q_s32(input); 42 in6->val[1] = vld1q_s32(input + 4); 43 input += 32; 44 in7->val[0] = vld1q_s32(input); 45 in7->val[1] = vld1q_s32(input + 4); 46} 47 48static INLINE void load_4x8_s32_dual(const tran_low_t *input, 49 int32x4_t *const in0, int32x4_t *const in1, 50 int32x4_t *const in2, int32x4_t *const in3, 51 int32x4_t *const in4, int32x4_t *const in5, 52 int32x4_t *const in6, 53 int32x4_t *const in7) { 54 *in0 = vld1q_s32(input); 55 input += 32; 56 *in1 = vld1q_s32(input); 57 input += 32; 58 *in2 = vld1q_s32(input); 59 input += 32; 60 *in3 = vld1q_s32(input); 61 input += 32; 62 *in4 = vld1q_s32(input); 63 input += 32; 64 *in5 = vld1q_s32(input); 65 input += 32; 66 *in6 = vld1q_s32(input); 67 input += 32; 68 *in7 = vld1q_s32(input); 69} 70 71// Only for the first pass of the _135_ variant. Since it only uses values from 72// the top left 16x16 it can safely assume all the remaining values are 0 and 73// skip an awful lot of calculations. In fact, only the first 12 columns make 74// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are 75// used so it skips any calls to input[12|13|14|15] too. 76// In C this does a single row of 32 for each call. Here it transposes the top 77// left 12x8 to allow using SIMD. 78 79// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero 80// coefficients as follows: 81// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 82// 0 0 2 5 10 17 25 38 47 62 83 101 121 83// 1 1 4 8 15 22 30 45 58 74 92 112 133 84// 2 3 7 12 18 28 36 52 64 82 102 118 85// 3 6 11 16 23 31 43 60 73 90 109 126 86// 4 9 14 19 29 37 50 65 78 98 116 134 87// 5 13 20 26 35 44 54 72 85 105 123 88// 6 21 27 33 42 53 63 80 94 113 132 89// 7 24 32 39 48 57 71 88 104 120 90// 8 34 40 46 56 68 81 96 111 130 91// 9 41 49 55 67 77 91 107 124 92// 10 51 59 66 76 89 99 119 131 93// 11 61 69 75 87 100 114 129 94// 12 70 79 86 97 108 122 95// 13 84 93 103 110 125 96// 14 98 106 115 127 97// 15 117 128 98static void vpx_highbd_idct32_12_neon(const tran_low_t *const input, 99 int32_t *output) { 100 int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], 101 s8[32]; 102 103 load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], 104 &in[6], &in[7]); 105 transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], 106 &in[7]); 107 108 load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0], 109 &in[9].val[1], &in[10].val[0], &in[10].val[1], 110 &in[11].val[0], &in[11].val[1]); 111 transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1], 112 &in[10].val[0], &in[10].val[1], &in[11].val[0], 113 &in[11].val[1]); 114 115 // stage 1 116 s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); 117 s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); 118 119 s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); 120 s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); 121 122 s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); 123 s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); 124 125 s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); 126 s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); 127 128 s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); 129 s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); 130 131 s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); 132 s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); 133 134 // stage 2 135 s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); 136 s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); 137 138 s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); 139 s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); 140 141 s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); 142 s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); 143 144 s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); 145 s2[19] = highbd_idct_add_dual(s1[18], s1[19]); 146 s2[20] = highbd_idct_add_dual(s1[20], s1[21]); 147 s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); 148 s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); 149 s2[27] = highbd_idct_add_dual(s1[26], s1[27]); 150 s2[28] = highbd_idct_add_dual(s1[28], s1[29]); 151 s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); 152 153 // stage 3 154 s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); 155 s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); 156 157 s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); 158 s3[11] = highbd_idct_add_dual(s2[10], s2[11]); 159 s3[12] = highbd_idct_add_dual(s2[12], s2[13]); 160 s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); 161 162 s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, 163 s1[31], cospi_28_64); 164 s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, 165 s1[31], cospi_4_64); 166 167 s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, 168 s2[29], -cospi_4_64); 169 s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, 170 s2[29], cospi_28_64); 171 172 s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, 173 s2[26], cospi_12_64); 174 s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, 175 s2[26], cospi_20_64); 176 177 s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, 178 s1[24], -cospi_20_64); 179 s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, 180 s1[24], cospi_12_64); 181 182 // stage 4 183 s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); 184 s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); 185 s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); 186 187 s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, 188 s2[15], cospi_24_64); 189 s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, 190 s2[15], cospi_8_64); 191 192 s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, 193 s3[13], -cospi_8_64); 194 s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, 195 s3[13], cospi_24_64); 196 197 s4[16] = highbd_idct_add_dual(s1[16], s2[19]); 198 s4[17] = highbd_idct_add_dual(s3[17], s3[18]); 199 s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); 200 s4[19] = highbd_idct_sub_dual(s1[16], s2[19]); 201 s4[20] = highbd_idct_sub_dual(s1[23], s2[20]); 202 s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); 203 s4[22] = highbd_idct_add_dual(s3[21], s3[22]); 204 s4[23] = highbd_idct_add_dual(s2[20], s1[23]); 205 s4[24] = highbd_idct_add_dual(s1[24], s2[27]); 206 s4[25] = highbd_idct_add_dual(s3[25], s3[26]); 207 s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); 208 s4[27] = highbd_idct_sub_dual(s1[24], s2[27]); 209 s4[28] = highbd_idct_sub_dual(s1[31], s2[28]); 210 s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); 211 s4[30] = highbd_idct_add_dual(s3[29], s3[30]); 212 s4[31] = highbd_idct_add_dual(s2[28], s1[31]); 213 214 // stage 5 215 s5[0] = highbd_idct_add_dual(s4[0], s4[3]); 216 s5[1] = highbd_idct_add_dual(s4[0], s4[2]); 217 s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); 218 s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); 219 220 s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64); 221 s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64); 222 223 s5[8] = highbd_idct_add_dual(s2[8], s3[11]); 224 s5[9] = highbd_idct_add_dual(s4[9], s4[10]); 225 s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); 226 s5[11] = highbd_idct_sub_dual(s2[8], s3[11]); 227 s5[12] = highbd_idct_sub_dual(s2[15], s3[12]); 228 s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); 229 s5[14] = highbd_idct_add_dual(s4[13], s4[14]); 230 s5[15] = highbd_idct_add_dual(s2[15], s3[12]); 231 232 s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, 233 s4[29], cospi_24_64); 234 s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, 235 s4[29], cospi_8_64); 236 237 s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, 238 s4[28], cospi_24_64); 239 s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, 240 s4[28], cospi_8_64); 241 242 s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, 243 s4[27], -cospi_8_64); 244 s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, 245 s4[27], cospi_24_64); 246 247 s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, 248 s4[26], -cospi_8_64); 249 s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, 250 s4[26], cospi_24_64); 251 252 // stage 6 253 s6[0] = highbd_idct_add_dual(s5[0], s3[7]); 254 s6[1] = highbd_idct_add_dual(s5[1], s5[6]); 255 s6[2] = highbd_idct_add_dual(s5[2], s5[5]); 256 s6[3] = highbd_idct_add_dual(s5[3], s3[4]); 257 s6[4] = highbd_idct_sub_dual(s5[3], s3[4]); 258 s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); 259 s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); 260 s6[7] = highbd_idct_sub_dual(s5[0], s3[7]); 261 262 s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); 263 s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); 264 265 s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); 266 s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); 267 268 s6[16] = highbd_idct_add_dual(s4[16], s4[23]); 269 s6[17] = highbd_idct_add_dual(s4[17], s4[22]); 270 s6[18] = highbd_idct_add_dual(s5[18], s5[21]); 271 s6[19] = highbd_idct_add_dual(s5[19], s5[20]); 272 s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); 273 s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); 274 s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); 275 s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); 276 277 s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); 278 s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); 279 s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); 280 s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); 281 s6[28] = highbd_idct_add_dual(s5[27], s5[28]); 282 s6[29] = highbd_idct_add_dual(s5[26], s5[29]); 283 s6[30] = highbd_idct_add_dual(s4[25], s4[30]); 284 s6[31] = highbd_idct_add_dual(s4[24], s4[31]); 285 286 // stage 7 287 s7[0] = highbd_idct_add_dual(s6[0], s5[15]); 288 s7[1] = highbd_idct_add_dual(s6[1], s5[14]); 289 s7[2] = highbd_idct_add_dual(s6[2], s6[13]); 290 s7[3] = highbd_idct_add_dual(s6[3], s6[12]); 291 s7[4] = highbd_idct_add_dual(s6[4], s6[11]); 292 s7[5] = highbd_idct_add_dual(s6[5], s6[10]); 293 s7[6] = highbd_idct_add_dual(s6[6], s5[9]); 294 s7[7] = highbd_idct_add_dual(s6[7], s5[8]); 295 s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); 296 s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); 297 s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); 298 s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); 299 s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); 300 s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); 301 s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); 302 s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); 303 304 s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); 305 s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); 306 307 s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); 308 s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); 309 310 s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); 311 s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); 312 313 s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); 314 s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); 315 316 // final stage 317 s8[0] = highbd_idct_add_dual(s7[0], s6[31]); 318 s8[1] = highbd_idct_add_dual(s7[1], s6[30]); 319 s8[2] = highbd_idct_add_dual(s7[2], s6[29]); 320 s8[3] = highbd_idct_add_dual(s7[3], s6[28]); 321 s8[4] = highbd_idct_add_dual(s7[4], s7[27]); 322 s8[5] = highbd_idct_add_dual(s7[5], s7[26]); 323 s8[6] = highbd_idct_add_dual(s7[6], s7[25]); 324 s8[7] = highbd_idct_add_dual(s7[7], s7[24]); 325 s8[8] = highbd_idct_add_dual(s7[8], s7[23]); 326 s8[9] = highbd_idct_add_dual(s7[9], s7[22]); 327 s8[10] = highbd_idct_add_dual(s7[10], s7[21]); 328 s8[11] = highbd_idct_add_dual(s7[11], s7[20]); 329 s8[12] = highbd_idct_add_dual(s7[12], s6[19]); 330 s8[13] = highbd_idct_add_dual(s7[13], s6[18]); 331 s8[14] = highbd_idct_add_dual(s7[14], s6[17]); 332 s8[15] = highbd_idct_add_dual(s7[15], s6[16]); 333 s8[16] = highbd_idct_sub_dual(s7[15], s6[16]); 334 s8[17] = highbd_idct_sub_dual(s7[14], s6[17]); 335 s8[18] = highbd_idct_sub_dual(s7[13], s6[18]); 336 s8[19] = highbd_idct_sub_dual(s7[12], s6[19]); 337 s8[20] = highbd_idct_sub_dual(s7[11], s7[20]); 338 s8[21] = highbd_idct_sub_dual(s7[10], s7[21]); 339 s8[22] = highbd_idct_sub_dual(s7[9], s7[22]); 340 s8[23] = highbd_idct_sub_dual(s7[8], s7[23]); 341 s8[24] = highbd_idct_sub_dual(s7[7], s7[24]); 342 s8[25] = highbd_idct_sub_dual(s7[6], s7[25]); 343 s8[26] = highbd_idct_sub_dual(s7[5], s7[26]); 344 s8[27] = highbd_idct_sub_dual(s7[4], s7[27]); 345 s8[28] = highbd_idct_sub_dual(s7[3], s6[28]); 346 s8[29] = highbd_idct_sub_dual(s7[2], s6[29]); 347 s8[30] = highbd_idct_sub_dual(s7[1], s6[30]); 348 s8[31] = highbd_idct_sub_dual(s7[0], s6[31]); 349 350 vst1q_s32(output + 0, s8[0].val[0]); 351 vst1q_s32(output + 4, s8[0].val[1]); 352 output += 16; 353 vst1q_s32(output + 0, s8[1].val[0]); 354 vst1q_s32(output + 4, s8[1].val[1]); 355 output += 16; 356 vst1q_s32(output + 0, s8[2].val[0]); 357 vst1q_s32(output + 4, s8[2].val[1]); 358 output += 16; 359 vst1q_s32(output + 0, s8[3].val[0]); 360 vst1q_s32(output + 4, s8[3].val[1]); 361 output += 16; 362 vst1q_s32(output + 0, s8[4].val[0]); 363 vst1q_s32(output + 4, s8[4].val[1]); 364 output += 16; 365 vst1q_s32(output + 0, s8[5].val[0]); 366 vst1q_s32(output + 4, s8[5].val[1]); 367 output += 16; 368 vst1q_s32(output + 0, s8[6].val[0]); 369 vst1q_s32(output + 4, s8[6].val[1]); 370 output += 16; 371 vst1q_s32(output + 0, s8[7].val[0]); 372 vst1q_s32(output + 4, s8[7].val[1]); 373 output += 16; 374 375 vst1q_s32(output + 0, s8[8].val[0]); 376 vst1q_s32(output + 4, s8[8].val[1]); 377 output += 16; 378 vst1q_s32(output + 0, s8[9].val[0]); 379 vst1q_s32(output + 4, s8[9].val[1]); 380 output += 16; 381 vst1q_s32(output + 0, s8[10].val[0]); 382 vst1q_s32(output + 4, s8[10].val[1]); 383 output += 16; 384 vst1q_s32(output + 0, s8[11].val[0]); 385 vst1q_s32(output + 4, s8[11].val[1]); 386 output += 16; 387 vst1q_s32(output + 0, s8[12].val[0]); 388 vst1q_s32(output + 4, s8[12].val[1]); 389 output += 16; 390 vst1q_s32(output + 0, s8[13].val[0]); 391 vst1q_s32(output + 4, s8[13].val[1]); 392 output += 16; 393 vst1q_s32(output + 0, s8[14].val[0]); 394 vst1q_s32(output + 4, s8[14].val[1]); 395 output += 16; 396 vst1q_s32(output + 0, s8[15].val[0]); 397 vst1q_s32(output + 4, s8[15].val[1]); 398 output += 16; 399 400 vst1q_s32(output + 0, s8[16].val[0]); 401 vst1q_s32(output + 4, s8[16].val[1]); 402 output += 16; 403 vst1q_s32(output + 0, s8[17].val[0]); 404 vst1q_s32(output + 4, s8[17].val[1]); 405 output += 16; 406 vst1q_s32(output + 0, s8[18].val[0]); 407 vst1q_s32(output + 4, s8[18].val[1]); 408 output += 16; 409 vst1q_s32(output + 0, s8[19].val[0]); 410 vst1q_s32(output + 4, s8[19].val[1]); 411 output += 16; 412 vst1q_s32(output + 0, s8[20].val[0]); 413 vst1q_s32(output + 4, s8[20].val[1]); 414 output += 16; 415 vst1q_s32(output + 0, s8[21].val[0]); 416 vst1q_s32(output + 4, s8[21].val[1]); 417 output += 16; 418 vst1q_s32(output + 0, s8[22].val[0]); 419 vst1q_s32(output + 4, s8[22].val[1]); 420 output += 16; 421 vst1q_s32(output + 0, s8[23].val[0]); 422 vst1q_s32(output + 4, s8[23].val[1]); 423 output += 16; 424 425 vst1q_s32(output + 0, s8[24].val[0]); 426 vst1q_s32(output + 4, s8[24].val[1]); 427 output += 16; 428 vst1q_s32(output + 0, s8[25].val[0]); 429 vst1q_s32(output + 4, s8[25].val[1]); 430 output += 16; 431 vst1q_s32(output + 0, s8[26].val[0]); 432 vst1q_s32(output + 4, s8[26].val[1]); 433 output += 16; 434 vst1q_s32(output + 0, s8[27].val[0]); 435 vst1q_s32(output + 4, s8[27].val[1]); 436 output += 16; 437 vst1q_s32(output + 0, s8[28].val[0]); 438 vst1q_s32(output + 4, s8[28].val[1]); 439 output += 16; 440 vst1q_s32(output + 0, s8[29].val[0]); 441 vst1q_s32(output + 4, s8[29].val[1]); 442 output += 16; 443 vst1q_s32(output + 0, s8[30].val[0]); 444 vst1q_s32(output + 4, s8[30].val[1]); 445 output += 16; 446 vst1q_s32(output + 0, s8[31].val[0]); 447 vst1q_s32(output + 4, s8[31].val[1]); 448} 449 450static void vpx_highbd_idct32_16_neon(const int32_t *const input, 451 uint16_t *const output, const int stride, 452 const int bd) { 453 int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], 454 out[32]; 455 456 load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], 457 &in[5], &in[6], &in[7]); 458 459 load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], 460 &in[12], &in[13], &in[14], &in[15]); 461 462 // stage 1 463 s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); 464 s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); 465 466 s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64); 467 s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64); 468 469 s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); 470 s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); 471 472 s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); 473 s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); 474 475 s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); 476 s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); 477 478 s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); 479 s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); 480 481 s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64); 482 s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64); 483 484 s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); 485 s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); 486 487 // stage 2 488 s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); 489 s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); 490 491 s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64); 492 s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64); 493 494 s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); 495 s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); 496 497 s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); 498 s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); 499 500 s2[16] = highbd_idct_add_dual(s1[16], s1[17]); 501 s2[17] = highbd_idct_sub_dual(s1[16], s1[17]); 502 s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); 503 s2[19] = highbd_idct_add_dual(s1[18], s1[19]); 504 s2[20] = highbd_idct_add_dual(s1[20], s1[21]); 505 s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); 506 s2[22] = highbd_idct_sub_dual(s1[23], s1[22]); 507 s2[23] = highbd_idct_add_dual(s1[22], s1[23]); 508 s2[24] = highbd_idct_add_dual(s1[24], s1[25]); 509 s2[25] = highbd_idct_sub_dual(s1[24], s1[25]); 510 s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); 511 s2[27] = highbd_idct_add_dual(s1[26], s1[27]); 512 s2[28] = highbd_idct_add_dual(s1[28], s1[29]); 513 s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); 514 s2[30] = highbd_idct_sub_dual(s1[31], s1[30]); 515 s2[31] = highbd_idct_add_dual(s1[30], s1[31]); 516 517 // stage 3 518 s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); 519 s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); 520 521 s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64); 522 s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64); 523 524 s3[8] = highbd_idct_add_dual(s2[8], s2[9]); 525 s3[9] = highbd_idct_sub_dual(s2[8], s2[9]); 526 s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); 527 s3[11] = highbd_idct_add_dual(s2[10], s2[11]); 528 s3[12] = highbd_idct_add_dual(s2[12], s2[13]); 529 s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); 530 s3[14] = highbd_idct_sub_dual(s2[15], s2[14]); 531 s3[15] = highbd_idct_add_dual(s2[14], s2[15]); 532 533 s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64, 534 s2[30], cospi_28_64); 535 s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64, 536 s2[30], cospi_4_64); 537 538 s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, 539 s2[29], -cospi_4_64); 540 s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, 541 s2[29], cospi_28_64); 542 543 s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, 544 s2[26], cospi_12_64); 545 s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, 546 s2[26], cospi_20_64); 547 548 s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64, 549 s2[25], -cospi_20_64); 550 s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64, 551 s2[25], cospi_12_64); 552 553 // stage 4 554 s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); 555 s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); 556 s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); 557 558 s4[4] = highbd_idct_add_dual(s3[4], s3[5]); 559 s4[5] = highbd_idct_sub_dual(s3[4], s3[5]); 560 s4[6] = highbd_idct_sub_dual(s3[7], s3[6]); 561 s4[7] = highbd_idct_add_dual(s3[6], s3[7]); 562 563 s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64, 564 s3[14], cospi_24_64); 565 s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64, 566 s3[14], cospi_8_64); 567 568 s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, 569 s3[13], -cospi_8_64); 570 s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, 571 s3[13], cospi_24_64); 572 573 s4[16] = highbd_idct_add_dual(s2[16], s2[19]); 574 s4[17] = highbd_idct_add_dual(s3[17], s3[18]); 575 s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); 576 s4[19] = highbd_idct_sub_dual(s2[16], s2[19]); 577 s4[20] = highbd_idct_sub_dual(s2[23], s2[20]); 578 s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); 579 s4[22] = highbd_idct_add_dual(s3[21], s3[22]); 580 s4[23] = highbd_idct_add_dual(s2[20], s2[23]); 581 s4[24] = highbd_idct_add_dual(s2[24], s2[27]); 582 s4[25] = highbd_idct_add_dual(s3[25], s3[26]); 583 s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); 584 s4[27] = highbd_idct_sub_dual(s2[24], s2[27]); 585 s4[28] = highbd_idct_sub_dual(s2[31], s2[28]); 586 s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); 587 s4[30] = highbd_idct_add_dual(s3[29], s3[30]); 588 s4[31] = highbd_idct_add_dual(s2[28], s2[31]); 589 590 // stage 5 591 s5[0] = highbd_idct_add_dual(s4[0], s4[3]); 592 s5[1] = highbd_idct_add_dual(s4[0], s4[2]); 593 s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); 594 s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); 595 596 s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64); 597 s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64); 598 599 s5[8] = highbd_idct_add_dual(s3[8], s3[11]); 600 s5[9] = highbd_idct_add_dual(s4[9], s4[10]); 601 s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); 602 s5[11] = highbd_idct_sub_dual(s3[8], s3[11]); 603 s5[12] = highbd_idct_sub_dual(s3[15], s3[12]); 604 s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); 605 s5[14] = highbd_idct_add_dual(s4[13], s4[14]); 606 s5[15] = highbd_idct_add_dual(s3[15], s3[12]); 607 608 s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, 609 s4[29], cospi_24_64); 610 s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, 611 s4[29], cospi_8_64); 612 613 s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, 614 s4[28], cospi_24_64); 615 s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, 616 s4[28], cospi_8_64); 617 618 s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, 619 s4[27], -cospi_8_64); 620 s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, 621 s4[27], cospi_24_64); 622 623 s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, 624 s4[26], -cospi_8_64); 625 s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, 626 s4[26], cospi_24_64); 627 628 // stage 6 629 s6[0] = highbd_idct_add_dual(s5[0], s4[7]); 630 s6[1] = highbd_idct_add_dual(s5[1], s5[6]); 631 s6[2] = highbd_idct_add_dual(s5[2], s5[5]); 632 s6[3] = highbd_idct_add_dual(s5[3], s4[4]); 633 s6[4] = highbd_idct_sub_dual(s5[3], s4[4]); 634 s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); 635 s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); 636 s6[7] = highbd_idct_sub_dual(s5[0], s4[7]); 637 638 s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); 639 s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); 640 641 s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); 642 s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); 643 644 s6[16] = highbd_idct_add_dual(s4[16], s4[23]); 645 s6[17] = highbd_idct_add_dual(s4[17], s4[22]); 646 s6[18] = highbd_idct_add_dual(s5[18], s5[21]); 647 s6[19] = highbd_idct_add_dual(s5[19], s5[20]); 648 s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); 649 s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); 650 s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); 651 s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); 652 s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); 653 s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); 654 s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); 655 s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); 656 s6[28] = highbd_idct_add_dual(s5[27], s5[28]); 657 s6[29] = highbd_idct_add_dual(s5[26], s5[29]); 658 s6[30] = highbd_idct_add_dual(s4[25], s4[30]); 659 s6[31] = highbd_idct_add_dual(s4[24], s4[31]); 660 661 // stage 7 662 s7[0] = highbd_idct_add_dual(s6[0], s5[15]); 663 s7[1] = highbd_idct_add_dual(s6[1], s5[14]); 664 s7[2] = highbd_idct_add_dual(s6[2], s6[13]); 665 s7[3] = highbd_idct_add_dual(s6[3], s6[12]); 666 s7[4] = highbd_idct_add_dual(s6[4], s6[11]); 667 s7[5] = highbd_idct_add_dual(s6[5], s6[10]); 668 s7[6] = highbd_idct_add_dual(s6[6], s5[9]); 669 s7[7] = highbd_idct_add_dual(s6[7], s5[8]); 670 s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); 671 s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); 672 s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); 673 s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); 674 s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); 675 s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); 676 s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); 677 s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); 678 679 s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); 680 s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); 681 682 s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); 683 s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); 684 685 s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); 686 s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); 687 688 s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); 689 s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); 690 691 // final stage 692 out[0] = highbd_idct_add_dual(s7[0], s6[31]); 693 out[1] = highbd_idct_add_dual(s7[1], s6[30]); 694 out[2] = highbd_idct_add_dual(s7[2], s6[29]); 695 out[3] = highbd_idct_add_dual(s7[3], s6[28]); 696 out[4] = highbd_idct_add_dual(s7[4], s7[27]); 697 out[5] = highbd_idct_add_dual(s7[5], s7[26]); 698 out[6] = highbd_idct_add_dual(s7[6], s7[25]); 699 out[7] = highbd_idct_add_dual(s7[7], s7[24]); 700 out[8] = highbd_idct_add_dual(s7[8], s7[23]); 701 out[9] = highbd_idct_add_dual(s7[9], s7[22]); 702 out[10] = highbd_idct_add_dual(s7[10], s7[21]); 703 out[11] = highbd_idct_add_dual(s7[11], s7[20]); 704 out[12] = highbd_idct_add_dual(s7[12], s6[19]); 705 out[13] = highbd_idct_add_dual(s7[13], s6[18]); 706 out[14] = highbd_idct_add_dual(s7[14], s6[17]); 707 out[15] = highbd_idct_add_dual(s7[15], s6[16]); 708 out[16] = highbd_idct_sub_dual(s7[15], s6[16]); 709 out[17] = highbd_idct_sub_dual(s7[14], s6[17]); 710 out[18] = highbd_idct_sub_dual(s7[13], s6[18]); 711 out[19] = highbd_idct_sub_dual(s7[12], s6[19]); 712 out[20] = highbd_idct_sub_dual(s7[11], s7[20]); 713 out[21] = highbd_idct_sub_dual(s7[10], s7[21]); 714 out[22] = highbd_idct_sub_dual(s7[9], s7[22]); 715 out[23] = highbd_idct_sub_dual(s7[8], s7[23]); 716 out[24] = highbd_idct_sub_dual(s7[7], s7[24]); 717 out[25] = highbd_idct_sub_dual(s7[6], s7[25]); 718 out[26] = highbd_idct_sub_dual(s7[5], s7[26]); 719 out[27] = highbd_idct_sub_dual(s7[4], s7[27]); 720 out[28] = highbd_idct_sub_dual(s7[3], s6[28]); 721 out[29] = highbd_idct_sub_dual(s7[2], s6[29]); 722 out[30] = highbd_idct_sub_dual(s7[1], s6[30]); 723 out[31] = highbd_idct_sub_dual(s7[0], s6[31]); 724 725 highbd_idct16x16_add_store(out, output, stride, bd); 726 highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); 727} 728 729void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, 730 int stride, int bd) { 731 int i; 732 733 if (bd == 8) { 734 int16_t temp[32 * 16]; 735 int16_t *t = temp; 736 vpx_idct32_12_neon(input, temp); 737 vpx_idct32_12_neon(input + 32 * 8, temp + 8); 738 739 for (i = 0; i < 32; i += 8) { 740 vpx_idct32_16_neon(t, dest, stride, 1); 741 t += (16 * 8); 742 dest += 8; 743 } 744 } else { 745 int32_t temp[32 * 16]; 746 int32_t *t = temp; 747 vpx_highbd_idct32_12_neon(input, temp); 748 vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8); 749 750 for (i = 0; i < 32; i += 8) { 751 vpx_highbd_idct32_16_neon(t, dest, stride, bd); 752 t += (16 * 8); 753 dest += 8; 754 } 755 } 756} 757