/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"

static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0,
                                int16x8_t *const in1, int16x8_t *const in2,
                                int16x8_t *const in3, int16x8_t *const in4,
                                int16x8_t *const in5, int16x8_t *const in6,
                                int16x8_t *const in7) {
  *in0 = load_tran_low_to_s16q(input);
  input += 32;
  *in1 = load_tran_low_to_s16q(input);
  input += 32;
  *in2 = load_tran_low_to_s16q(input);
  input += 32;
  *in3 = load_tran_low_to_s16q(input);
  input += 32;
  *in4 = load_tran_low_to_s16q(input);
  input += 32;
  *in5 = load_tran_low_to_s16q(input);
  input += 32;
  *in6 = load_tran_low_to_s16q(input);
  input += 32;
  *in7 = load_tran_low_to_s16q(input);
}

static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0,
                                int16x4_t *const in1, int16x4_t *const in2,
                                int16x4_t *const in3, int16x4_t *const in4,
                                int16x4_t *const in5, int16x4_t *const in6,
                                int16x4_t *const in7) {
  *in0 = load_tran_low_to_s16d(input);
  input += 32;
  *in1 = load_tran_low_to_s16d(input);
  input += 32;
  *in2 = load_tran_low_to_s16d(input);
  input += 32;
  *in3 = load_tran_low_to_s16d(input);
  input += 32;
  *in4 = load_tran_low_to_s16d(input);
  input += 32;
  *in5 = load_tran_low_to_s16d(input);
  input += 32;
  *in6 = load_tran_low_to_s16d(input);
  input += 32;
  *in7 = load_tran_low_to_s16d(input);
}

// Only for the first pass of the _135_ variant. Since it only uses values from
// the top left 16x16 it can safely assume all the remaining values are 0 and
// skip an awful lot of calculations. In fact, only the first 12 columns make
// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
// used so it never reads input[12|13|14|15] either.
// In C this does a single row of 32 for each call. Here it transposes the top
// left 12x8 to allow using SIMD.
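
// For reference: the *_shift_and_narrow helpers used throughout this file
// come from vpx_dsp/arm/idct_neon.h. They all end with the equivalent of
// dct_const_round_shift() (vpx_dsp/txfm_common.h): a rounding right shift by
// DCT_CONST_BITS (14) that narrows the widened product back to 16 bits. A
// per-lane scalar sketch of their semantics (illustrative only, not the
// actual NEON code):
//
//   multiply_shift_and_narrow_s16(a, c):
//       out = ROUND_POWER_OF_TWO(a * c, 14)
//   multiply_accumulate_shift_and_narrow_s16(a, c1, b, c2):
//       out = ROUND_POWER_OF_TWO(a * c1 + b * c2, 14)
//   add_multiply_shift_and_narrow_s16(a, b, c):
//       out = ROUND_POWER_OF_TWO((a + b) * c, 14)
//   sub_multiply_shift_and_narrow_s16(a, b, c):
//       out = ROUND_POWER_OF_TWO((a - b) * c, 14)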

// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135
// non-zero coefficients as follows:
//      0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
//  0   0   2   5  10  17  25  38  47  62  83 101 121
//  1   1   4   8  15  22  30  45  58  74  92 112 133
//  2   3   7  12  18  28  36  52  64  82 102 118
//  3   6  11  16  23  31  43  60  73  90 109 126
//  4   9  14  19  29  37  50  65  78  98 116 134
//  5  13  20  26  35  44  54  72  85 105 123
//  6  21  27  33  42  53  63  80  94 113 132
//  7  24  32  39  48  57  71  88 104 120
//  8  34  40  46  56  68  81  96 111 130
//  9  41  49  55  67  77  91 107 124
// 10  51  59  66  76  89  99 119 131
// 11  61  69  75  87 100 114 129
// 12  70  79  86  97 108 122
// 13  84  93 103 110 125
// 14  95 106 115 127
// 15 117 128
void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) {
  int16x4_t tmp[8];
  int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32];

  load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
               &in[7]);
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5],
               &tmp[6], &tmp[7]);
  transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6],
                    tmp[7], &in[8], &in[9], &in[10], &in[11]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);

  s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
  s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);

  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);

  s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
  s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);

  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);

  s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
  s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);

  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);

  s2[18] = vsubq_s16(s1[19], s1[18]);
  s2[19] = vaddq_s16(s1[18], s1[19]);
  s2[20] = vaddq_s16(s1[20], s1[21]);
  s2[21] = vsubq_s16(s1[20], s1[21]);
  s2[26] = vsubq_s16(s1[27], s1[26]);
  s2[27] = vaddq_s16(s1[26], s1[27]);
  s2[28] = vaddq_s16(s1[28], s1[29]);
  s2[29] = vsubq_s16(s1[28], s1[29]);
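
  // Note on the elements skipped above: columns 12-15 of the input (the
  // would-be in[12]-in[15]) are all zero for eob <= 135, so the terms derived
  // from them in the full transform (s1[17]/s1[30], s1[22]/s1[25],
  // s2[9]/s2[14], and the outer stage 2 butterflies s2[16]/s2[17],
  // s2[22]-s2[25], s2[30]/s2[31]) are never materialized. Later stages use
  // the surviving operand directly; e.g. stage 3 reads s1[16]/s1[31] where
  // the full transform reads s2[17]/s2[30], since s1[17] == s1[30] == 0 makes
  // them equal.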

  // stage 3
  s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
  s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);

  s3[10] = vsubq_s16(s2[11], s2[10]);
  s3[11] = vaddq_s16(s2[10], s2[11]);
  s3[12] = vaddq_s16(s2[12], s2[13]);
  s3[13] = vsubq_s16(s2[12], s2[13]);

  s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64,
                                                    s1[31], cospi_28_64);
  s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64,
                                                    s1[31], cospi_4_64);

  s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
                                                    s2[29], -cospi_4_64);
  s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64,
                                                    s2[29], cospi_28_64);

  s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
                                                    s2[26], cospi_12_64);
  s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64,
                                                    s2[26], cospi_20_64);

  s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
                                                    s1[24], -cospi_20_64);
  s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
                                                    s1[24], cospi_12_64);

  // stage 4
  s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
  s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
  s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);

  s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                   cospi_24_64);
  s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
                                                    cospi_8_64);

  s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
                                                    s3[13], -cospi_8_64);
  s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64,
                                                    s3[13], cospi_24_64);

  s4[16] = vaddq_s16(s1[16], s2[19]);
  s4[17] = vaddq_s16(s3[17], s3[18]);
  s4[18] = vsubq_s16(s3[17], s3[18]);
  s4[19] = vsubq_s16(s1[16], s2[19]);
  s4[20] = vsubq_s16(s1[23], s2[20]);
  s4[21] = vsubq_s16(s3[22], s3[21]);
  s4[22] = vaddq_s16(s3[21], s3[22]);
  s4[23] = vaddq_s16(s2[20], s1[23]);
  s4[24] = vaddq_s16(s1[24], s2[27]);
  s4[25] = vaddq_s16(s3[25], s3[26]);
  s4[26] = vsubq_s16(s3[25], s3[26]);
  s4[27] = vsubq_s16(s1[24], s2[27]);
  s4[28] = vsubq_s16(s1[31], s2[28]);
  s4[29] = vsubq_s16(s3[30], s3[29]);
  s4[30] = vaddq_s16(s3[29], s3[30]);
  s4[31] = vaddq_s16(s2[28], s1[31]);

  // stage 5
  s5[0] = vaddq_s16(s4[0], s4[3]);
  s5[1] = vaddq_s16(s4[0], s4[2]);
  s5[2] = vsubq_s16(s4[0], s4[2]);
  s5[3] = vsubq_s16(s4[0], s4[3]);

  s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64);
  s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64);

  s5[8] = vaddq_s16(s2[8], s3[11]);
  s5[9] = vaddq_s16(s4[9], s4[10]);
  s5[10] = vsubq_s16(s4[9], s4[10]);
  s5[11] = vsubq_s16(s2[8], s3[11]);
  s5[12] = vsubq_s16(s2[15], s3[12]);
  s5[13] = vsubq_s16(s4[14], s4[13]);
  s5[14] = vaddq_s16(s4[13], s4[14]);
  s5[15] = vaddq_s16(s2[15], s3[12]);

  s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64,
                                                    s4[29], cospi_24_64);
  s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64,
                                                    s4[29], cospi_8_64);

  s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64,
                                                    s4[28], cospi_24_64);
  s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64,
                                                    s4[28], cospi_8_64);

  s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
                                                    s4[27], -cospi_8_64);
  s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64,
                                                    s4[27], cospi_24_64);

  s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
                                                    s4[26], -cospi_8_64);
  s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64,
                                                    s4[26], cospi_24_64);
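
  // Stage 5 above and stage 6 below read s3[4]/s3[7] in place of s4[4]-s4[7]:
  // with in[12] zero, s3[5] and s3[6] vanish and stage 4's butterflies reduce
  // to copies (s4[4] == s4[5] == s3[4], s4[6] == s4[7] == s3[7]).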

  // stage 6
  s6[0] = vaddq_s16(s5[0], s3[7]);
  s6[1] = vaddq_s16(s5[1], s5[6]);
  s6[2] = vaddq_s16(s5[2], s5[5]);
  s6[3] = vaddq_s16(s5[3], s3[4]);
  s6[4] = vsubq_s16(s5[3], s3[4]);
  s6[5] = vsubq_s16(s5[2], s5[5]);
  s6[6] = vsubq_s16(s5[1], s5[6]);
  s6[7] = vsubq_s16(s5[0], s3[7]);

  s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
  s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);

  s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
  s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);

  s6[16] = vaddq_s16(s4[16], s4[23]);
  s6[17] = vaddq_s16(s4[17], s4[22]);
  s6[18] = vaddq_s16(s5[18], s5[21]);
  s6[19] = vaddq_s16(s5[19], s5[20]);
  s6[20] = vsubq_s16(s5[19], s5[20]);
  s6[21] = vsubq_s16(s5[18], s5[21]);
  s6[22] = vsubq_s16(s4[17], s4[22]);
  s6[23] = vsubq_s16(s4[16], s4[23]);

  s6[24] = vsubq_s16(s4[31], s4[24]);
  s6[25] = vsubq_s16(s4[30], s4[25]);
  s6[26] = vsubq_s16(s5[29], s5[26]);
  s6[27] = vsubq_s16(s5[28], s5[27]);
  s6[28] = vaddq_s16(s5[27], s5[28]);
  s6[29] = vaddq_s16(s5[26], s5[29]);
  s6[30] = vaddq_s16(s4[25], s4[30]);
  s6[31] = vaddq_s16(s4[24], s4[31]);

  // stage 7
  s7[0] = vaddq_s16(s6[0], s5[15]);
  s7[1] = vaddq_s16(s6[1], s5[14]);
  s7[2] = vaddq_s16(s6[2], s6[13]);
  s7[3] = vaddq_s16(s6[3], s6[12]);
  s7[4] = vaddq_s16(s6[4], s6[11]);
  s7[5] = vaddq_s16(s6[5], s6[10]);
  s7[6] = vaddq_s16(s6[6], s5[9]);
  s7[7] = vaddq_s16(s6[7], s5[8]);
  s7[8] = vsubq_s16(s6[7], s5[8]);
  s7[9] = vsubq_s16(s6[6], s5[9]);
  s7[10] = vsubq_s16(s6[5], s6[10]);
  s7[11] = vsubq_s16(s6[4], s6[11]);
  s7[12] = vsubq_s16(s6[3], s6[12]);
  s7[13] = vsubq_s16(s6[2], s6[13]);
  s7[14] = vsubq_s16(s6[1], s5[14]);
  s7[15] = vsubq_s16(s6[0], s5[15]);

  s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
  s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);

  s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
  s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);

  s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
  s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);

  s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
  s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
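
  // The final butterfly produces rows i and 31 - i as the sum and difference
  // of the same operand pair. Rows are stored with a stride of 16 int16_t
  // because the caller's temp buffer is 32 rows of 16 coefficients; each call
  // of this function fills an 8-column slice of it.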

  // final stage
  vst1q_s16(output, vaddq_s16(s7[0], s6[31]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[1], s6[30]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[2], s6[29]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[3], s6[28]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[4], s7[27]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[5], s7[26]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[6], s7[25]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[7], s7[24]));
  output += 16;

  vst1q_s16(output, vaddq_s16(s7[8], s7[23]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[9], s7[22]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[10], s7[21]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[11], s7[20]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[12], s6[19]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[13], s6[18]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[14], s6[17]));
  output += 16;
  vst1q_s16(output, vaddq_s16(s7[15], s6[16]));
  output += 16;

  vst1q_s16(output, vsubq_s16(s7[15], s6[16]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[14], s6[17]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[13], s6[18]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[12], s6[19]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[11], s7[20]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[10], s7[21]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[9], s7[22]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[8], s7[23]));
  output += 16;

  vst1q_s16(output, vsubq_s16(s7[7], s7[24]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[6], s7[25]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[5], s7[26]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[4], s7[27]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[3], s6[28]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[2], s6[29]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[1], s6[30]));
  output += 16;
  vst1q_s16(output, vsubq_s16(s7[0], s6[31]));
}
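
// Second pass of the _135_ variant: a 32-point transform whose inputs 16-31
// are known to be zero, since the first pass only produced 16 non-zero rows
// of coefficients. Each call reads an 8x16 block of the temp buffer
// (transposed on load) and adds a 32-row by 8-column result to the
// destination. With highbd_flag set, output points to uint16_t pixels
// holding 8-bit-range data, hence the highbd_add_and_store_bd8() path.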
void vpx_idct32_16_neon(const int16_t *const input, void *const output,
                        const int stride, const int highbd_flag) {
  int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
      out[32];

  load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
                             &in[5], &in[6], &in[7]);

  load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
                             &in[12], &in[13], &in[14], &in[15]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);

  s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64);
  s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64);

  s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
  s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);

  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);

  s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
  s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);

  s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64);
  s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64);

  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);

  s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64);
  s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64);

  s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
  s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);

  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);

  s2[16] = vaddq_s16(s1[16], s1[17]);
  s2[17] = vsubq_s16(s1[16], s1[17]);
  s2[18] = vsubq_s16(s1[19], s1[18]);
  s2[19] = vaddq_s16(s1[18], s1[19]);
  s2[20] = vaddq_s16(s1[20], s1[21]);
  s2[21] = vsubq_s16(s1[20], s1[21]);
  s2[22] = vsubq_s16(s1[23], s1[22]);
  s2[23] = vaddq_s16(s1[22], s1[23]);
  s2[24] = vaddq_s16(s1[24], s1[25]);
  s2[25] = vsubq_s16(s1[24], s1[25]);
  s2[26] = vsubq_s16(s1[27], s1[26]);
  s2[27] = vaddq_s16(s1[26], s1[27]);
  s2[28] = vaddq_s16(s1[28], s1[29]);
  s2[29] = vsubq_s16(s1[28], s1[29]);
  s2[30] = vsubq_s16(s1[31], s1[30]);
  s2[31] = vaddq_s16(s1[30], s1[31]);

  // stage 3
  s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
  s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);

  s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64);
  s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64);

  s3[8] = vaddq_s16(s2[8], s2[9]);
  s3[9] = vsubq_s16(s2[8], s2[9]);
  s3[10] = vsubq_s16(s2[11], s2[10]);
  s3[11] = vaddq_s16(s2[10], s2[11]);
  s3[12] = vaddq_s16(s2[12], s2[13]);
  s3[13] = vsubq_s16(s2[12], s2[13]);
  s3[14] = vsubq_s16(s2[15], s2[14]);
  s3[15] = vaddq_s16(s2[14], s2[15]);

  s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64,
                                                    s2[30], cospi_28_64);
  s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64,
                                                    s2[30], cospi_4_64);

  s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
                                                    s2[29], -cospi_4_64);
  s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64,
                                                    s2[29], cospi_28_64);

  s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
                                                    s2[26], cospi_12_64);
  s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64,
                                                    s2[26], cospi_20_64);

  s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64,
                                                    s2[25], -cospi_20_64);
  s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64,
                                                    s2[25], cospi_12_64);

  // stage 4
  s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
  s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
  s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);

  s4[4] = vaddq_s16(s3[4], s3[5]);
  s4[5] = vsubq_s16(s3[4], s3[5]);
  s4[6] = vsubq_s16(s3[7], s3[6]);
  s4[7] = vaddq_s16(s3[6], s3[7]);

  s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14],
                                                   cospi_24_64);
  s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14],
                                                    cospi_8_64);

  s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
                                                    s3[13], -cospi_8_64);
  s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64,
                                                    s3[13], cospi_24_64);

  s4[16] = vaddq_s16(s2[16], s2[19]);
  s4[17] = vaddq_s16(s3[17], s3[18]);
  s4[18] = vsubq_s16(s3[17], s3[18]);
  s4[19] = vsubq_s16(s2[16], s2[19]);
  s4[20] = vsubq_s16(s2[23], s2[20]);
  s4[21] = vsubq_s16(s3[22], s3[21]);
  s4[22] = vaddq_s16(s3[21], s3[22]);
  s4[23] = vaddq_s16(s2[20], s2[23]);
  s4[24] = vaddq_s16(s2[24], s2[27]);
  s4[25] = vaddq_s16(s3[25], s3[26]);
  s4[26] = vsubq_s16(s3[25], s3[26]);
  s4[27] = vsubq_s16(s2[24], s2[27]);
  s4[28] = vsubq_s16(s2[31], s2[28]);
  s4[29] = vsubq_s16(s3[30], s3[29]);
  s4[30] = vaddq_s16(s3[29], s3[30]);
  s4[31] = vaddq_s16(s2[28], s2[31]);
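
  // Stage 5 below uses s4[0] where the full transform distinguishes step[0]
  // and step[1]: those are (in[0] +/- in[16]) * cospi_16_64 after rounding,
  // and with in[16] zero both reduce to s4[0], so s4[1] is never computed.
  // The same shortcut appears in vpx_idct32_12_neon above.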

  // stage 5
  s5[0] = vaddq_s16(s4[0], s4[3]);
  s5[1] = vaddq_s16(s4[0], s4[2]);
  s5[2] = vsubq_s16(s4[0], s4[2]);
  s5[3] = vsubq_s16(s4[0], s4[3]);

  s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64);
  s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64);

  s5[8] = vaddq_s16(s3[8], s3[11]);
  s5[9] = vaddq_s16(s4[9], s4[10]);
  s5[10] = vsubq_s16(s4[9], s4[10]);
  s5[11] = vsubq_s16(s3[8], s3[11]);
  s5[12] = vsubq_s16(s3[15], s3[12]);
  s5[13] = vsubq_s16(s4[14], s4[13]);
  s5[14] = vaddq_s16(s4[13], s4[14]);
  s5[15] = vaddq_s16(s3[15], s3[12]);

  s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64,
                                                    s4[29], cospi_24_64);
  s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64,
                                                    s4[29], cospi_8_64);

  s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64,
                                                    s4[28], cospi_24_64);
  s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64,
                                                    s4[28], cospi_8_64);

  s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
                                                    s4[27], -cospi_8_64);
  s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64,
                                                    s4[27], cospi_24_64);

  s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
                                                    s4[26], -cospi_8_64);
  s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64,
                                                    s4[26], cospi_24_64);

  // stage 6
  s6[0] = vaddq_s16(s5[0], s4[7]);
  s6[1] = vaddq_s16(s5[1], s5[6]);
  s6[2] = vaddq_s16(s5[2], s5[5]);
  s6[3] = vaddq_s16(s5[3], s4[4]);
  s6[4] = vsubq_s16(s5[3], s4[4]);
  s6[5] = vsubq_s16(s5[2], s5[5]);
  s6[6] = vsubq_s16(s5[1], s5[6]);
  s6[7] = vsubq_s16(s5[0], s4[7]);

  s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
  s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);

  s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
  s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);

  s6[16] = vaddq_s16(s4[16], s4[23]);
  s6[17] = vaddq_s16(s4[17], s4[22]);
  s6[18] = vaddq_s16(s5[18], s5[21]);
  s6[19] = vaddq_s16(s5[19], s5[20]);
  s6[20] = vsubq_s16(s5[19], s5[20]);
  s6[21] = vsubq_s16(s5[18], s5[21]);
  s6[22] = vsubq_s16(s4[17], s4[22]);
  s6[23] = vsubq_s16(s4[16], s4[23]);
  s6[24] = vsubq_s16(s4[31], s4[24]);
  s6[25] = vsubq_s16(s4[30], s4[25]);
  s6[26] = vsubq_s16(s5[29], s5[26]);
  s6[27] = vsubq_s16(s5[28], s5[27]);
  s6[28] = vaddq_s16(s5[27], s5[28]);
  s6[29] = vaddq_s16(s5[26], s5[29]);
  s6[30] = vaddq_s16(s4[25], s4[30]);
  s6[31] = vaddq_s16(s4[24], s4[31]);

  // stage 7
  s7[0] = vaddq_s16(s6[0], s5[15]);
  s7[1] = vaddq_s16(s6[1], s5[14]);
  s7[2] = vaddq_s16(s6[2], s6[13]);
  s7[3] = vaddq_s16(s6[3], s6[12]);
  s7[4] = vaddq_s16(s6[4], s6[11]);
  s7[5] = vaddq_s16(s6[5], s6[10]);
  s7[6] = vaddq_s16(s6[6], s5[9]);
  s7[7] = vaddq_s16(s6[7], s5[8]);
  s7[8] = vsubq_s16(s6[7], s5[8]);
  s7[9] = vsubq_s16(s6[6], s5[9]);
  s7[10] = vsubq_s16(s6[5], s6[10]);
  s7[11] = vsubq_s16(s6[4], s6[11]);
  s7[12] = vsubq_s16(s6[3], s6[12]);
  s7[13] = vsubq_s16(s6[2], s6[13]);
  s7[14] = vsubq_s16(s6[1], s5[14]);
  s7[15] = vsubq_s16(s6[0], s5[15]);

  s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
  s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);

  s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
  s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);

  s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
  s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);

  s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
  s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
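
  // The final stage mirrors the store sequence at the end of
  // vpx_idct32_12_neon: row i is s7[i] + t and row 31 - i is s7[i] - t for
  // the same second operand t. final_add()/final_sub() come from idct_neon.h;
  // in high-bitdepth builds they saturate rather than wrap so the wider
  // dynamic range cannot overflow the 16-bit lanes.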

  // final stage
  out[0] = final_add(s7[0], s6[31]);
  out[1] = final_add(s7[1], s6[30]);
  out[2] = final_add(s7[2], s6[29]);
  out[3] = final_add(s7[3], s6[28]);
  out[4] = final_add(s7[4], s7[27]);
  out[5] = final_add(s7[5], s7[26]);
  out[6] = final_add(s7[6], s7[25]);
  out[7] = final_add(s7[7], s7[24]);
  out[8] = final_add(s7[8], s7[23]);
  out[9] = final_add(s7[9], s7[22]);
  out[10] = final_add(s7[10], s7[21]);
  out[11] = final_add(s7[11], s7[20]);
  out[12] = final_add(s7[12], s6[19]);
  out[13] = final_add(s7[13], s6[18]);
  out[14] = final_add(s7[14], s6[17]);
  out[15] = final_add(s7[15], s6[16]);
  out[16] = final_sub(s7[15], s6[16]);
  out[17] = final_sub(s7[14], s6[17]);
  out[18] = final_sub(s7[13], s6[18]);
  out[19] = final_sub(s7[12], s6[19]);
  out[20] = final_sub(s7[11], s7[20]);
  out[21] = final_sub(s7[10], s7[21]);
  out[22] = final_sub(s7[9], s7[22]);
  out[23] = final_sub(s7[8], s7[23]);
  out[24] = final_sub(s7[7], s7[24]);
  out[25] = final_sub(s7[6], s7[25]);
  out[26] = final_sub(s7[5], s7[26]);
  out[27] = final_sub(s7[4], s7[27]);
  out[28] = final_sub(s7[3], s6[28]);
  out[29] = final_sub(s7[2], s6[29]);
  out[30] = final_sub(s7[1], s6[30]);
  out[31] = final_sub(s7[0], s6[31]);

  if (highbd_flag) {
    highbd_add_and_store_bd8(out, output, stride);
  } else {
    uint8_t *const outputT = (uint8_t *)output;
    add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5],
                         out[6], out[7], outputT, stride);
    add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13],
                         out[14], out[15], outputT + (8 * stride), stride);
    add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21],
                         out[22], out[23], outputT + (16 * stride), stride);
    add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29],
                         out[30], out[31], outputT + (24 * stride), stride);
  }
}

void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  int i;
  int16_t temp[32 * 16];
  int16_t *t = temp;

  vpx_idct32_12_neon(input, temp);
  vpx_idct32_12_neon(input + 32 * 8, temp + 8);

  for (i = 0; i < 32; i += 8) {
    vpx_idct32_16_neon(t, dest, stride, 0);
    t += (16 * 8);
    dest += 8;
  }
}
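
// NOTE: callers select this kernel by end-of-block position; in libvpx the
// 32x32 inverse transform (e.g. vp9/common/vp9_idct.c) dispatches to the
// _34_, _135_ or _1024_ variant for eob <= 34, eob <= 135 and larger eobs
// respectively.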