1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <arm_neon.h> 12 13#include "./vpx_config.h" 14#include "./vpx_dsp_rtcd.h" 15#include "vpx_dsp/txfm_common.h" 16#include "vpx_dsp/arm/mem_neon.h" 17#include "vpx_dsp/arm/transpose_neon.h" 18 19// Most gcc 4.9 distributions outside of Android do not generate correct code 20// for this function. 21#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ 22 __GNUC__ == 4 && __GNUC_MINOR__ <= 9 23 24void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { 25 vpx_fdct32x32_c(input, output, stride); 26} 27 28void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, 29 int stride) { 30 vpx_fdct32x32_rd_c(input, output, stride); 31} 32 33#else 34 35#define LOAD_INCREMENT(src, stride, dest, index) \ 36 do { \ 37 dest[index] = vld1q_s16(src); \ 38 src += stride; \ 39 } while (0) 40 41#define ADD_S16(src, index0, index1, dest, index3) \ 42 do { \ 43 dest[index3] = vaddq_s16(src[index0], src[index1]); \ 44 } while (0) 45 46#define ADD_SHIFT_S16(src, index0, index1) \ 47 do { \ 48 src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ 49 } while (0) 50 51// Load, cross, and multiply by 4. Load the first 8 and last 8, then the 52// middle 53// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 54static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { 55 const int16_t *a_end = a + 24 * stride; 56 int16x8_t c[8]; 57 58 LOAD_INCREMENT(a, stride, b, 0); 59 LOAD_INCREMENT(a, stride, b, 1); 60 LOAD_INCREMENT(a, stride, b, 2); 61 LOAD_INCREMENT(a, stride, b, 3); 62 LOAD_INCREMENT(a, stride, b, 4); 63 LOAD_INCREMENT(a, stride, b, 5); 64 LOAD_INCREMENT(a, stride, b, 6); 65 LOAD_INCREMENT(a, stride, b, 7); 66 67 LOAD_INCREMENT(a_end, stride, b, 24); 68 LOAD_INCREMENT(a_end, stride, b, 25); 69 LOAD_INCREMENT(a_end, stride, b, 26); 70 LOAD_INCREMENT(a_end, stride, b, 27); 71 LOAD_INCREMENT(a_end, stride, b, 28); 72 LOAD_INCREMENT(a_end, stride, b, 29); 73 LOAD_INCREMENT(a_end, stride, b, 30); 74 LOAD_INCREMENT(a_end, stride, b, 31); 75 76 ADD_S16(b, 0, 31, c, 0); 77 ADD_S16(b, 1, 30, c, 1); 78 ADD_S16(b, 2, 29, c, 2); 79 ADD_S16(b, 3, 28, c, 3); 80 ADD_S16(b, 4, 27, c, 4); 81 ADD_S16(b, 5, 26, c, 5); 82 ADD_S16(b, 6, 25, c, 6); 83 ADD_S16(b, 7, 24, c, 7); 84 85 ADD_SHIFT_S16(b, 7, 24); 86 ADD_SHIFT_S16(b, 6, 25); 87 ADD_SHIFT_S16(b, 5, 26); 88 ADD_SHIFT_S16(b, 4, 27); 89 ADD_SHIFT_S16(b, 3, 28); 90 ADD_SHIFT_S16(b, 2, 29); 91 ADD_SHIFT_S16(b, 1, 30); 92 ADD_SHIFT_S16(b, 0, 31); 93 94 b[0] = vshlq_n_s16(c[0], 2); 95 b[1] = vshlq_n_s16(c[1], 2); 96 b[2] = vshlq_n_s16(c[2], 2); 97 b[3] = vshlq_n_s16(c[3], 2); 98 b[4] = vshlq_n_s16(c[4], 2); 99 b[5] = vshlq_n_s16(c[5], 2); 100 b[6] = vshlq_n_s16(c[6], 2); 101 b[7] = vshlq_n_s16(c[7], 2); 102 103 LOAD_INCREMENT(a, stride, b, 8); 104 LOAD_INCREMENT(a, stride, b, 9); 105 LOAD_INCREMENT(a, stride, b, 10); 106 LOAD_INCREMENT(a, stride, b, 11); 107 LOAD_INCREMENT(a, stride, b, 12); 108 LOAD_INCREMENT(a, stride, b, 13); 109 LOAD_INCREMENT(a, stride, b, 14); 110 LOAD_INCREMENT(a, stride, b, 15); 111 LOAD_INCREMENT(a, stride, b, 16); 112 LOAD_INCREMENT(a, stride, b, 17); 113 LOAD_INCREMENT(a, stride, b, 18); 114 LOAD_INCREMENT(a, stride, b, 19); 115 LOAD_INCREMENT(a, stride, b, 20); 116 LOAD_INCREMENT(a, stride, b, 21); 117 LOAD_INCREMENT(a, stride, b, 22); 118 LOAD_INCREMENT(a, stride, b, 23); 119 120 ADD_S16(b, 8, 23, c, 0); 121 ADD_S16(b, 9, 22, c, 1); 122 ADD_S16(b, 10, 21, c, 2); 123 ADD_S16(b, 11, 20, c, 3); 124 ADD_S16(b, 12, 19, c, 4); 125 ADD_S16(b, 13, 18, c, 5); 126 ADD_S16(b, 14, 17, c, 6); 127 ADD_S16(b, 15, 16, c, 7); 128 129 ADD_SHIFT_S16(b, 15, 16); 130 ADD_SHIFT_S16(b, 14, 17); 131 ADD_SHIFT_S16(b, 13, 18); 132 ADD_SHIFT_S16(b, 12, 19); 133 ADD_SHIFT_S16(b, 11, 20); 134 ADD_SHIFT_S16(b, 10, 21); 135 ADD_SHIFT_S16(b, 9, 22); 136 ADD_SHIFT_S16(b, 8, 23); 137 138 b[8] = vshlq_n_s16(c[0], 2); 139 b[9] = vshlq_n_s16(c[1], 2); 140 b[10] = vshlq_n_s16(c[2], 2); 141 b[11] = vshlq_n_s16(c[3], 2); 142 b[12] = vshlq_n_s16(c[4], 2); 143 b[13] = vshlq_n_s16(c[5], 2); 144 b[14] = vshlq_n_s16(c[6], 2); 145 b[15] = vshlq_n_s16(c[7], 2); 146} 147 148#undef LOAD_INCREMENT 149#undef ADD_S16 150#undef ADD_SHIFT_S16 151 152#define STORE_S16(src, index, dest) \ 153 do { \ 154 store_s16q_to_tran_low(dest, src[index]); \ 155 dest += 8; \ 156 } while (0); 157 158// Store 32 16x8 values, assuming stride == 32. 159// Slight twist: store horizontally in blocks of 8. 160static INLINE void store(tran_low_t *a, const int16x8_t *b) { 161 STORE_S16(b, 0, a); 162 STORE_S16(b, 8, a); 163 STORE_S16(b, 16, a); 164 STORE_S16(b, 24, a); 165 STORE_S16(b, 1, a); 166 STORE_S16(b, 9, a); 167 STORE_S16(b, 17, a); 168 STORE_S16(b, 25, a); 169 STORE_S16(b, 2, a); 170 STORE_S16(b, 10, a); 171 STORE_S16(b, 18, a); 172 STORE_S16(b, 26, a); 173 STORE_S16(b, 3, a); 174 STORE_S16(b, 11, a); 175 STORE_S16(b, 19, a); 176 STORE_S16(b, 27, a); 177 STORE_S16(b, 4, a); 178 STORE_S16(b, 12, a); 179 STORE_S16(b, 20, a); 180 STORE_S16(b, 28, a); 181 STORE_S16(b, 5, a); 182 STORE_S16(b, 13, a); 183 STORE_S16(b, 21, a); 184 STORE_S16(b, 29, a); 185 STORE_S16(b, 6, a); 186 STORE_S16(b, 14, a); 187 STORE_S16(b, 22, a); 188 STORE_S16(b, 30, a); 189 STORE_S16(b, 7, a); 190 STORE_S16(b, 15, a); 191 STORE_S16(b, 23, a); 192 STORE_S16(b, 31, a); 193} 194 195#undef STORE_S16 196 197// fdct_round_shift((a +/- b) * c) 198static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, 199 const tran_high_t constant, 200 int16x8_t *add, int16x8_t *sub) { 201 const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); 202 const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); 203 const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); 204 const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); 205 const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); 206 const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); 207 const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); 208 const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); 209 const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); 210 const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); 211 *add = vcombine_s16(rounded0, rounded1); 212 *sub = vcombine_s16(rounded2, rounded3); 213} 214 215// fdct_round_shift(a * c0 +/- b * c1) 216static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, 217 const tran_coef_t constant0, 218 const tran_coef_t constant1, 219 int16x8_t *add, int16x8_t *sub) { 220 const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); 221 const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); 222 const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); 223 const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); 224 const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); 225 const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); 226 const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); 227 const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); 228 const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); 229 const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); 230 const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); 231 const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); 232 *add = vcombine_s16(rounded0, rounded1); 233 *sub = vcombine_s16(rounded2, rounded3); 234} 235 236// Add 2 if positive, 1 if negative, and shift by 2. 237// In practice, subtract the sign bit, then shift with rounding. 238static INLINE int16x8_t sub_round_shift(const int16x8_t a) { 239 const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); 240 const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); 241 const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); 242 return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); 243} 244 245static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { 246 int16x8_t a[32]; 247 int16x8_t b[32]; 248 249 // Stage 1: Done as part of the load. 250 251 // Stage 2. 252 // Mini cross. X the first 16 values and the middle 8 of the second half. 253 a[0] = vaddq_s16(in[0], in[15]); 254 a[1] = vaddq_s16(in[1], in[14]); 255 a[2] = vaddq_s16(in[2], in[13]); 256 a[3] = vaddq_s16(in[3], in[12]); 257 a[4] = vaddq_s16(in[4], in[11]); 258 a[5] = vaddq_s16(in[5], in[10]); 259 a[6] = vaddq_s16(in[6], in[9]); 260 a[7] = vaddq_s16(in[7], in[8]); 261 262 a[8] = vsubq_s16(in[7], in[8]); 263 a[9] = vsubq_s16(in[6], in[9]); 264 a[10] = vsubq_s16(in[5], in[10]); 265 a[11] = vsubq_s16(in[4], in[11]); 266 a[12] = vsubq_s16(in[3], in[12]); 267 a[13] = vsubq_s16(in[2], in[13]); 268 a[14] = vsubq_s16(in[1], in[14]); 269 a[15] = vsubq_s16(in[0], in[15]); 270 271 a[16] = in[16]; 272 a[17] = in[17]; 273 a[18] = in[18]; 274 a[19] = in[19]; 275 276 butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); 277 butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); 278 butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); 279 butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); 280 281 a[28] = in[28]; 282 a[29] = in[29]; 283 a[30] = in[30]; 284 a[31] = in[31]; 285 286 // Stage 3. 287 b[0] = vaddq_s16(a[0], a[7]); 288 b[1] = vaddq_s16(a[1], a[6]); 289 b[2] = vaddq_s16(a[2], a[5]); 290 b[3] = vaddq_s16(a[3], a[4]); 291 292 b[4] = vsubq_s16(a[3], a[4]); 293 b[5] = vsubq_s16(a[2], a[5]); 294 b[6] = vsubq_s16(a[1], a[6]); 295 b[7] = vsubq_s16(a[0], a[7]); 296 297 b[8] = a[8]; 298 b[9] = a[9]; 299 300 butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); 301 butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); 302 303 b[14] = a[14]; 304 b[15] = a[15]; 305 306 b[16] = vaddq_s16(in[16], a[23]); 307 b[17] = vaddq_s16(in[17], a[22]); 308 b[18] = vaddq_s16(in[18], a[21]); 309 b[19] = vaddq_s16(in[19], a[20]); 310 311 b[20] = vsubq_s16(in[19], a[20]); 312 b[21] = vsubq_s16(in[18], a[21]); 313 b[22] = vsubq_s16(in[17], a[22]); 314 b[23] = vsubq_s16(in[16], a[23]); 315 316 b[24] = vsubq_s16(in[31], a[24]); 317 b[25] = vsubq_s16(in[30], a[25]); 318 b[26] = vsubq_s16(in[29], a[26]); 319 b[27] = vsubq_s16(in[28], a[27]); 320 321 b[28] = vaddq_s16(in[28], a[27]); 322 b[29] = vaddq_s16(in[29], a[26]); 323 b[30] = vaddq_s16(in[30], a[25]); 324 b[31] = vaddq_s16(in[31], a[24]); 325 326 // Stage 4. 327 a[0] = vaddq_s16(b[0], b[3]); 328 a[1] = vaddq_s16(b[1], b[2]); 329 a[2] = vsubq_s16(b[1], b[2]); 330 a[3] = vsubq_s16(b[0], b[3]); 331 332 a[4] = b[4]; 333 334 butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); 335 336 a[7] = b[7]; 337 338 a[8] = vaddq_s16(b[8], b[11]); 339 a[9] = vaddq_s16(b[9], b[10]); 340 a[10] = vsubq_s16(b[9], b[10]); 341 a[11] = vsubq_s16(b[8], b[11]); 342 a[12] = vsubq_s16(b[15], b[12]); 343 a[13] = vsubq_s16(b[14], b[13]); 344 a[14] = vaddq_s16(b[14], b[13]); 345 a[15] = vaddq_s16(b[15], b[12]); 346 347 a[16] = b[16]; 348 a[17] = b[17]; 349 350 butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); 351 butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); 352 butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); 353 butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); 354 355 a[22] = b[22]; 356 a[23] = b[23]; 357 a[24] = b[24]; 358 a[25] = b[25]; 359 360 a[30] = b[30]; 361 a[31] = b[31]; 362 363 // Stage 5. 364 butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); 365 butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); 366 367 b[4] = vaddq_s16(a[4], a[5]); 368 b[5] = vsubq_s16(a[4], a[5]); 369 b[6] = vsubq_s16(a[7], a[6]); 370 b[7] = vaddq_s16(a[7], a[6]); 371 372 b[8] = a[8]; 373 374 butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); 375 butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); 376 377 b[11] = a[11]; 378 b[12] = a[12]; 379 380 b[15] = a[15]; 381 382 b[16] = vaddq_s16(a[19], a[16]); 383 b[17] = vaddq_s16(a[18], a[17]); 384 b[18] = vsubq_s16(a[17], a[18]); 385 b[19] = vsubq_s16(a[16], a[19]); 386 b[20] = vsubq_s16(a[23], a[20]); 387 b[21] = vsubq_s16(a[22], a[21]); 388 b[22] = vaddq_s16(a[21], a[22]); 389 b[23] = vaddq_s16(a[20], a[23]); 390 b[24] = vaddq_s16(a[27], a[24]); 391 b[25] = vaddq_s16(a[26], a[25]); 392 b[26] = vsubq_s16(a[25], a[26]); 393 b[27] = vsubq_s16(a[24], a[27]); 394 b[28] = vsubq_s16(a[31], a[28]); 395 b[29] = vsubq_s16(a[30], a[29]); 396 b[30] = vaddq_s16(a[29], a[30]); 397 b[31] = vaddq_s16(a[28], a[31]); 398 399 // Stage 6. 400 a[0] = b[0]; 401 a[1] = b[1]; 402 a[2] = b[2]; 403 a[3] = b[3]; 404 405 butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); 406 butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); 407 408 a[8] = vaddq_s16(b[8], b[9]); 409 a[9] = vsubq_s16(b[8], b[9]); 410 a[10] = vsubq_s16(b[11], b[10]); 411 a[11] = vaddq_s16(b[11], b[10]); 412 a[12] = vaddq_s16(b[12], b[13]); 413 a[13] = vsubq_s16(b[12], b[13]); 414 a[14] = vsubq_s16(b[15], b[14]); 415 a[15] = vaddq_s16(b[15], b[14]); 416 417 a[16] = b[16]; 418 a[19] = b[19]; 419 a[20] = b[20]; 420 a[23] = b[23]; 421 a[24] = b[24]; 422 a[27] = b[27]; 423 a[28] = b[28]; 424 a[31] = b[31]; 425 426 butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); 427 butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); 428 429 butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); 430 butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); 431 432 // Stage 7. 433 b[0] = a[0]; 434 b[1] = a[1]; 435 b[2] = a[2]; 436 b[3] = a[3]; 437 b[4] = a[4]; 438 b[5] = a[5]; 439 b[6] = a[6]; 440 b[7] = a[7]; 441 442 butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); 443 butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); 444 butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); 445 butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); 446 447 b[16] = vaddq_s16(a[16], a[17]); 448 b[17] = vsubq_s16(a[16], a[17]); 449 b[18] = vsubq_s16(a[19], a[18]); 450 b[19] = vaddq_s16(a[19], a[18]); 451 b[20] = vaddq_s16(a[20], a[21]); 452 b[21] = vsubq_s16(a[20], a[21]); 453 b[22] = vsubq_s16(a[23], a[22]); 454 b[23] = vaddq_s16(a[23], a[22]); 455 b[24] = vaddq_s16(a[24], a[25]); 456 b[25] = vsubq_s16(a[24], a[25]); 457 b[26] = vsubq_s16(a[27], a[26]); 458 b[27] = vaddq_s16(a[27], a[26]); 459 b[28] = vaddq_s16(a[28], a[29]); 460 b[29] = vsubq_s16(a[28], a[29]); 461 b[30] = vsubq_s16(a[31], a[30]); 462 b[31] = vaddq_s16(a[31], a[30]); 463 464 // Final stage. 465 // Also compute partial rounding shift: 466 // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 467 out[0] = sub_round_shift(b[0]); 468 out[16] = sub_round_shift(b[1]); 469 out[8] = sub_round_shift(b[2]); 470 out[24] = sub_round_shift(b[3]); 471 out[4] = sub_round_shift(b[4]); 472 out[20] = sub_round_shift(b[5]); 473 out[12] = sub_round_shift(b[6]); 474 out[28] = sub_round_shift(b[7]); 475 out[2] = sub_round_shift(b[8]); 476 out[18] = sub_round_shift(b[9]); 477 out[10] = sub_round_shift(b[10]); 478 out[26] = sub_round_shift(b[11]); 479 out[6] = sub_round_shift(b[12]); 480 out[22] = sub_round_shift(b[13]); 481 out[14] = sub_round_shift(b[14]); 482 out[30] = sub_round_shift(b[15]); 483 484 butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); 485 out[1] = sub_round_shift(a[1]); 486 out[31] = sub_round_shift(a[31]); 487 488 butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); 489 out[17] = sub_round_shift(a[17]); 490 out[15] = sub_round_shift(a[15]); 491 492 butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); 493 out[9] = sub_round_shift(a[9]); 494 out[23] = sub_round_shift(a[23]); 495 496 butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); 497 out[25] = sub_round_shift(a[25]); 498 out[7] = sub_round_shift(a[7]); 499 500 butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); 501 out[5] = sub_round_shift(a[5]); 502 out[27] = sub_round_shift(a[27]); 503 504 butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); 505 out[21] = sub_round_shift(a[21]); 506 out[11] = sub_round_shift(a[11]); 507 508 butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); 509 out[13] = sub_round_shift(a[13]); 510 out[19] = sub_round_shift(a[19]); 511 512 butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); 513 out[29] = sub_round_shift(a[29]); 514 out[3] = sub_round_shift(a[3]); 515} 516 517#define PASS_THROUGH(src, dst, element) \ 518 do { \ 519 dst##_lo[element] = src##_lo[element]; \ 520 dst##_hi[element] = src##_hi[element]; \ 521 } while (0) 522 523#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ 524 do { \ 525 b##_lo[b_index] = \ 526 vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ 527 b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ 528 vget_high_s16(a[right_index])); \ 529 } while (0) 530 531#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ 532 do { \ 533 b##_lo[b_index] = \ 534 vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ 535 b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ 536 vget_high_s16(a[right_index])); \ 537 } while (0) 538 539#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ 540 do { \ 541 c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ 542 c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ 543 } while (0) 544 545#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ 546 do { \ 547 temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ 548 temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ 549 c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ 550 c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ 551 } while (0) 552 553#define ADD_S32(a, left_index, right_index, b, b_index) \ 554 do { \ 555 b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ 556 b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ 557 } while (0) 558 559#define SUB_S32(a, left_index, right_index, b, b_index) \ 560 do { \ 561 b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ 562 b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ 563 } while (0) 564 565// Like butterfly_one_coeff, but don't narrow results. 566static INLINE void butterfly_one_coeff_s16_s32( 567 const int16x8_t a, const int16x8_t b, const tran_high_t constant, 568 int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, 569 int32x4_t *sub_hi) { 570 const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); 571 const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); 572 const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); 573 const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); 574 const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); 575 const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); 576 *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); 577 *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); 578 *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); 579 *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); 580} 581 582#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ 583 add_index, sub_index) \ 584 do { \ 585 butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ 586 &b##_lo[add_index], &b##_hi[add_index], \ 587 &b##_lo[sub_index], &b##_hi[sub_index]); \ 588 } while (0) 589 590// Like butterfly_one_coeff, but with s32. 591static INLINE void butterfly_one_coeff_s32( 592 const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, 593 const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, 594 int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { 595 const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); 596 const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); 597 const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); 598 const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); 599 const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); 600 const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); 601 *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); 602 *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); 603 *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); 604 *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); 605} 606 607#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ 608 sub_index) \ 609 do { \ 610 butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ 611 a##_lo[right_index], a##_hi[right_index], \ 612 constant, &b##_lo[add_index], &b##_hi[add_index], \ 613 &b##_lo[sub_index], &b##_hi[sub_index]); \ 614 } while (0) 615 616// Like butterfly_two_coeff, but with s32. 617static INLINE void butterfly_two_coeff_s32( 618 const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, 619 const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, 620 int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, 621 int32x4_t *sub_hi) { 622 const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); 623 const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); 624 const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); 625 const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); 626 const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); 627 const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); 628 const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); 629 const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); 630 *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); 631 *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); 632 *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); 633 *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); 634} 635 636#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ 637 right_constant, b, add_index, sub_index) \ 638 do { \ 639 butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ 640 a##_lo[right_index], a##_hi[right_index], \ 641 left_constant, right_constant, &b##_lo[add_index], \ 642 &b##_hi[add_index], &b##_lo[sub_index], \ 643 &b##_hi[sub_index]); \ 644 } while (0) 645 646// Add 1 if positive, 2 if negative, and shift by 2. 647// In practice, add 1, then add the sign bit, then shift without rounding. 648static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, 649 const int32x4_t a_hi) { 650 const int32x4_t one = vdupq_n_s32(1); 651 const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); 652 const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); 653 const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); 654 const int16x4_t b_lo = 655 vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); 656 const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); 657 const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); 658 const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); 659 const int16x4_t b_hi = 660 vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); 661 return vcombine_s16(b_lo, b_hi); 662} 663 664static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { 665 int16x8_t a[32]; 666 int16x8_t b[32]; 667 int32x4_t c_lo[32]; 668 int32x4_t c_hi[32]; 669 int32x4_t d_lo[32]; 670 int32x4_t d_hi[32]; 671 672 // Stage 1. Done as part of the load for the first pass. 673 a[0] = vaddq_s16(in[0], in[31]); 674 a[1] = vaddq_s16(in[1], in[30]); 675 a[2] = vaddq_s16(in[2], in[29]); 676 a[3] = vaddq_s16(in[3], in[28]); 677 a[4] = vaddq_s16(in[4], in[27]); 678 a[5] = vaddq_s16(in[5], in[26]); 679 a[6] = vaddq_s16(in[6], in[25]); 680 a[7] = vaddq_s16(in[7], in[24]); 681 a[8] = vaddq_s16(in[8], in[23]); 682 a[9] = vaddq_s16(in[9], in[22]); 683 a[10] = vaddq_s16(in[10], in[21]); 684 a[11] = vaddq_s16(in[11], in[20]); 685 a[12] = vaddq_s16(in[12], in[19]); 686 a[13] = vaddq_s16(in[13], in[18]); 687 a[14] = vaddq_s16(in[14], in[17]); 688 a[15] = vaddq_s16(in[15], in[16]); 689 a[16] = vsubq_s16(in[15], in[16]); 690 a[17] = vsubq_s16(in[14], in[17]); 691 a[18] = vsubq_s16(in[13], in[18]); 692 a[19] = vsubq_s16(in[12], in[19]); 693 a[20] = vsubq_s16(in[11], in[20]); 694 a[21] = vsubq_s16(in[10], in[21]); 695 a[22] = vsubq_s16(in[9], in[22]); 696 a[23] = vsubq_s16(in[8], in[23]); 697 a[24] = vsubq_s16(in[7], in[24]); 698 a[25] = vsubq_s16(in[6], in[25]); 699 a[26] = vsubq_s16(in[5], in[26]); 700 a[27] = vsubq_s16(in[4], in[27]); 701 a[28] = vsubq_s16(in[3], in[28]); 702 a[29] = vsubq_s16(in[2], in[29]); 703 a[30] = vsubq_s16(in[1], in[30]); 704 a[31] = vsubq_s16(in[0], in[31]); 705 706 // Stage 2. 707 b[0] = vaddq_s16(a[0], a[15]); 708 b[1] = vaddq_s16(a[1], a[14]); 709 b[2] = vaddq_s16(a[2], a[13]); 710 b[3] = vaddq_s16(a[3], a[12]); 711 b[4] = vaddq_s16(a[4], a[11]); 712 b[5] = vaddq_s16(a[5], a[10]); 713 b[6] = vaddq_s16(a[6], a[9]); 714 b[7] = vaddq_s16(a[7], a[8]); 715 716 b[8] = vsubq_s16(a[7], a[8]); 717 b[9] = vsubq_s16(a[6], a[9]); 718 b[10] = vsubq_s16(a[5], a[10]); 719 b[11] = vsubq_s16(a[4], a[11]); 720 b[12] = vsubq_s16(a[3], a[12]); 721 b[13] = vsubq_s16(a[2], a[13]); 722 b[14] = vsubq_s16(a[1], a[14]); 723 b[15] = vsubq_s16(a[0], a[15]); 724 725 b[16] = a[16]; 726 b[17] = a[17]; 727 b[18] = a[18]; 728 b[19] = a[19]; 729 730 butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); 731 butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); 732 butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); 733 butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); 734 735 b[28] = a[28]; 736 b[29] = a[29]; 737 b[30] = a[30]; 738 b[31] = a[31]; 739 740 // Stage 3. With extreme values for input this calculation rolls over int16_t. 741 // The sources for b[0] get added multiple times and, through testing, have 742 // been shown to overflow starting here. 743 ADD_S16_S32(b, 0, 7, c, 0); 744 ADD_S16_S32(b, 1, 6, c, 1); 745 ADD_S16_S32(b, 2, 5, c, 2); 746 ADD_S16_S32(b, 3, 4, c, 3); 747 SUB_S16_S32(b, 3, 4, c, 4); 748 SUB_S16_S32(b, 2, 5, c, 5); 749 SUB_S16_S32(b, 1, 6, c, 6); 750 SUB_S16_S32(b, 0, 7, c, 7); 751 752 a[8] = b[8]; 753 a[9] = b[9]; 754 755 BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); 756 BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); 757 758 a[14] = b[14]; 759 a[15] = b[15]; 760 761 ADD_S16_S32(b, 16, 23, c, 16); 762 ADD_S16_S32(b, 17, 22, c, 17); 763 ADD_S16_S32(b, 18, 21, c, 18); 764 ADD_S16_S32(b, 19, 20, c, 19); 765 SUB_S16_S32(b, 19, 20, c, 20); 766 SUB_S16_S32(b, 18, 21, c, 21); 767 SUB_S16_S32(b, 17, 22, c, 22); 768 SUB_S16_S32(b, 16, 23, c, 23); 769 SUB_S16_S32(b, 31, 24, c, 24); 770 SUB_S16_S32(b, 30, 25, c, 25); 771 SUB_S16_S32(b, 29, 26, c, 26); 772 SUB_S16_S32(b, 28, 27, c, 27); 773 ADD_S16_S32(b, 28, 27, c, 28); 774 ADD_S16_S32(b, 29, 26, c, 29); 775 ADD_S16_S32(b, 30, 25, c, 30); 776 ADD_S16_S32(b, 31, 24, c, 31); 777 778 // Stage 4. 779 ADD_S32(c, 0, 3, d, 0); 780 ADD_S32(c, 1, 2, d, 1); 781 SUB_S32(c, 1, 2, d, 2); 782 SUB_S32(c, 0, 3, d, 3); 783 784 PASS_THROUGH(c, d, 4); 785 786 BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); 787 788 PASS_THROUGH(c, d, 7); 789 790 ADDW_S16_S32(c, 11, a, 8, d, 8); 791 ADDW_S16_S32(c, 10, a, 9, d, 9); 792 SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); 793 SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); 794 SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); 795 SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); 796 ADDW_S16_S32(c, 13, b, 14, d, 14); 797 ADDW_S16_S32(c, 12, b, 15, d, 15); 798 799 PASS_THROUGH(c, d, 16); 800 PASS_THROUGH(c, d, 17); 801 802 BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); 803 BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); 804 BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); 805 BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); 806 807 PASS_THROUGH(c, d, 22); 808 PASS_THROUGH(c, d, 23); 809 PASS_THROUGH(c, d, 24); 810 PASS_THROUGH(c, d, 25); 811 812 PASS_THROUGH(c, d, 30); 813 PASS_THROUGH(c, d, 31); 814 815 // Stage 5. 816 BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); 817 BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); 818 819 ADD_S32(d, 4, 5, c, 4); 820 SUB_S32(d, 4, 5, c, 5); 821 SUB_S32(d, 7, 6, c, 6); 822 ADD_S32(d, 7, 6, c, 7); 823 824 PASS_THROUGH(d, c, 8); 825 826 BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); 827 BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); 828 829 PASS_THROUGH(d, c, 11); 830 PASS_THROUGH(d, c, 12); 831 PASS_THROUGH(d, c, 15); 832 833 ADD_S32(d, 16, 19, c, 16); 834 ADD_S32(d, 17, 18, c, 17); 835 SUB_S32(d, 17, 18, c, 18); 836 SUB_S32(d, 16, 19, c, 19); 837 SUB_S32(d, 23, 20, c, 20); 838 SUB_S32(d, 22, 21, c, 21); 839 ADD_S32(d, 22, 21, c, 22); 840 ADD_S32(d, 23, 20, c, 23); 841 ADD_S32(d, 24, 27, c, 24); 842 ADD_S32(d, 25, 26, c, 25); 843 SUB_S32(d, 25, 26, c, 26); 844 SUB_S32(d, 24, 27, c, 27); 845 SUB_S32(d, 31, 28, c, 28); 846 SUB_S32(d, 30, 29, c, 29); 847 ADD_S32(d, 30, 29, c, 30); 848 ADD_S32(d, 31, 28, c, 31); 849 850 // Stage 6. 851 PASS_THROUGH(c, d, 0); 852 PASS_THROUGH(c, d, 1); 853 PASS_THROUGH(c, d, 2); 854 PASS_THROUGH(c, d, 3); 855 856 BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); 857 BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); 858 859 ADD_S32(c, 8, 9, d, 8); 860 SUB_S32(c, 8, 9, d, 9); 861 SUB_S32(c, 11, 10, d, 10); 862 ADD_S32(c, 11, 10, d, 11); 863 ADD_S32(c, 12, 13, d, 12); 864 SUB_S32(c, 12, 13, d, 13); 865 SUB_S32(c, 15, 14, d, 14); 866 ADD_S32(c, 15, 14, d, 15); 867 868 PASS_THROUGH(c, d, 16); 869 PASS_THROUGH(c, d, 19); 870 PASS_THROUGH(c, d, 20); 871 PASS_THROUGH(c, d, 23); 872 PASS_THROUGH(c, d, 24); 873 PASS_THROUGH(c, d, 27); 874 PASS_THROUGH(c, d, 28); 875 PASS_THROUGH(c, d, 31); 876 877 BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); 878 BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); 879 BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); 880 BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); 881 882 // Stage 7. 883 PASS_THROUGH(d, c, 0); 884 PASS_THROUGH(d, c, 1); 885 PASS_THROUGH(d, c, 2); 886 PASS_THROUGH(d, c, 3); 887 PASS_THROUGH(d, c, 4); 888 PASS_THROUGH(d, c, 5); 889 PASS_THROUGH(d, c, 6); 890 PASS_THROUGH(d, c, 7); 891 892 BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); 893 BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); 894 BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); 895 BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); 896 897 ADD_S32(d, 16, 17, c, 16); 898 SUB_S32(d, 16, 17, c, 17); 899 SUB_S32(d, 19, 18, c, 18); 900 ADD_S32(d, 19, 18, c, 19); 901 ADD_S32(d, 20, 21, c, 20); 902 SUB_S32(d, 20, 21, c, 21); 903 SUB_S32(d, 23, 22, c, 22); 904 ADD_S32(d, 23, 22, c, 23); 905 ADD_S32(d, 24, 25, c, 24); 906 SUB_S32(d, 24, 25, c, 25); 907 SUB_S32(d, 27, 26, c, 26); 908 ADD_S32(d, 27, 26, c, 27); 909 ADD_S32(d, 28, 29, c, 28); 910 SUB_S32(d, 28, 29, c, 29); 911 SUB_S32(d, 31, 30, c, 30); 912 ADD_S32(d, 31, 30, c, 31); 913 914 // Final stage. 915 // Roll rounding into this function so we can pass back int16x8. 916 917 out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); 918 out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); 919 920 out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); 921 out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); 922 out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); 923 out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); 924 out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); 925 926 out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); 927 out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); 928 out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); 929 out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); 930 931 out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); 932 out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); 933 out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); 934 out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); 935 out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); 936 937 BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); 938 out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); 939 out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); 940 941 BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); 942 out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); 943 out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); 944 945 BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); 946 out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); 947 out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); 948 949 BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); 950 out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); 951 out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); 952 953 BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); 954 out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); 955 out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); 956 957 BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); 958 out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); 959 out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); 960 961 BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); 962 out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); 963 out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); 964 965 BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); 966 out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); 967 out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); 968} 969 970// Add 1 if positive, 2 if negative, and shift by 2. 971// In practice, add 1, then add the sign bit, then shift without rounding. 972static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { 973 const int16x8_t one = vdupq_n_s16(1); 974 const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); 975 const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); 976 const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); 977 return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); 978} 979 980static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { 981 int16x8_t a[32]; 982 int16x8_t b[32]; 983 984 // Stage 1. Done as part of the load for the first pass. 985 a[0] = vaddq_s16(in[0], in[31]); 986 a[1] = vaddq_s16(in[1], in[30]); 987 a[2] = vaddq_s16(in[2], in[29]); 988 a[3] = vaddq_s16(in[3], in[28]); 989 a[4] = vaddq_s16(in[4], in[27]); 990 a[5] = vaddq_s16(in[5], in[26]); 991 a[6] = vaddq_s16(in[6], in[25]); 992 a[7] = vaddq_s16(in[7], in[24]); 993 a[8] = vaddq_s16(in[8], in[23]); 994 a[9] = vaddq_s16(in[9], in[22]); 995 a[10] = vaddq_s16(in[10], in[21]); 996 a[11] = vaddq_s16(in[11], in[20]); 997 a[12] = vaddq_s16(in[12], in[19]); 998 a[13] = vaddq_s16(in[13], in[18]); 999 a[14] = vaddq_s16(in[14], in[17]); 1000 a[15] = vaddq_s16(in[15], in[16]); 1001 a[16] = vsubq_s16(in[15], in[16]); 1002 a[17] = vsubq_s16(in[14], in[17]); 1003 a[18] = vsubq_s16(in[13], in[18]); 1004 a[19] = vsubq_s16(in[12], in[19]); 1005 a[20] = vsubq_s16(in[11], in[20]); 1006 a[21] = vsubq_s16(in[10], in[21]); 1007 a[22] = vsubq_s16(in[9], in[22]); 1008 a[23] = vsubq_s16(in[8], in[23]); 1009 a[24] = vsubq_s16(in[7], in[24]); 1010 a[25] = vsubq_s16(in[6], in[25]); 1011 a[26] = vsubq_s16(in[5], in[26]); 1012 a[27] = vsubq_s16(in[4], in[27]); 1013 a[28] = vsubq_s16(in[3], in[28]); 1014 a[29] = vsubq_s16(in[2], in[29]); 1015 a[30] = vsubq_s16(in[1], in[30]); 1016 a[31] = vsubq_s16(in[0], in[31]); 1017 1018 // Stage 2. 1019 // For the "rd" version, all the values are rounded down after stage 2 to keep 1020 // the values in 16 bits. 1021 b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); 1022 b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); 1023 b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); 1024 b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); 1025 b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); 1026 b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); 1027 b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); 1028 b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); 1029 1030 b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); 1031 b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); 1032 b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); 1033 b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); 1034 b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); 1035 b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); 1036 b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); 1037 b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); 1038 1039 b[16] = add_round_shift_s16(a[16]); 1040 b[17] = add_round_shift_s16(a[17]); 1041 b[18] = add_round_shift_s16(a[18]); 1042 b[19] = add_round_shift_s16(a[19]); 1043 1044 butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); 1045 butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); 1046 butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); 1047 butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); 1048 b[20] = add_round_shift_s16(b[20]); 1049 b[21] = add_round_shift_s16(b[21]); 1050 b[22] = add_round_shift_s16(b[22]); 1051 b[23] = add_round_shift_s16(b[23]); 1052 b[24] = add_round_shift_s16(b[24]); 1053 b[25] = add_round_shift_s16(b[25]); 1054 b[26] = add_round_shift_s16(b[26]); 1055 b[27] = add_round_shift_s16(b[27]); 1056 1057 b[28] = add_round_shift_s16(a[28]); 1058 b[29] = add_round_shift_s16(a[29]); 1059 b[30] = add_round_shift_s16(a[30]); 1060 b[31] = add_round_shift_s16(a[31]); 1061 1062 // Stage 3. 1063 a[0] = vaddq_s16(b[0], b[7]); 1064 a[1] = vaddq_s16(b[1], b[6]); 1065 a[2] = vaddq_s16(b[2], b[5]); 1066 a[3] = vaddq_s16(b[3], b[4]); 1067 1068 a[4] = vsubq_s16(b[3], b[4]); 1069 a[5] = vsubq_s16(b[2], b[5]); 1070 a[6] = vsubq_s16(b[1], b[6]); 1071 a[7] = vsubq_s16(b[0], b[7]); 1072 1073 a[8] = b[8]; 1074 a[9] = b[9]; 1075 1076 butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); 1077 butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); 1078 1079 a[14] = b[14]; 1080 a[15] = b[15]; 1081 1082 a[16] = vaddq_s16(b[16], b[23]); 1083 a[17] = vaddq_s16(b[17], b[22]); 1084 a[18] = vaddq_s16(b[18], b[21]); 1085 a[19] = vaddq_s16(b[19], b[20]); 1086 1087 a[20] = vsubq_s16(b[19], b[20]); 1088 a[21] = vsubq_s16(b[18], b[21]); 1089 a[22] = vsubq_s16(b[17], b[22]); 1090 a[23] = vsubq_s16(b[16], b[23]); 1091 1092 a[24] = vsubq_s16(b[31], b[24]); 1093 a[25] = vsubq_s16(b[30], b[25]); 1094 a[26] = vsubq_s16(b[29], b[26]); 1095 a[27] = vsubq_s16(b[28], b[27]); 1096 1097 a[28] = vaddq_s16(b[28], b[27]); 1098 a[29] = vaddq_s16(b[29], b[26]); 1099 a[30] = vaddq_s16(b[30], b[25]); 1100 a[31] = vaddq_s16(b[31], b[24]); 1101 1102 // Stage 4. 1103 b[0] = vaddq_s16(a[0], a[3]); 1104 b[1] = vaddq_s16(a[1], a[2]); 1105 b[2] = vsubq_s16(a[1], a[2]); 1106 b[3] = vsubq_s16(a[0], a[3]); 1107 1108 b[4] = a[4]; 1109 1110 butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); 1111 1112 b[7] = a[7]; 1113 1114 b[8] = vaddq_s16(a[8], a[11]); 1115 b[9] = vaddq_s16(a[9], a[10]); 1116 b[10] = vsubq_s16(a[9], a[10]); 1117 b[11] = vsubq_s16(a[8], a[11]); 1118 b[12] = vsubq_s16(a[15], a[12]); 1119 b[13] = vsubq_s16(a[14], a[13]); 1120 b[14] = vaddq_s16(a[14], a[13]); 1121 b[15] = vaddq_s16(a[15], a[12]); 1122 1123 b[16] = a[16]; 1124 b[17] = a[17]; 1125 1126 butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); 1127 butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); 1128 butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); 1129 butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); 1130 1131 b[22] = a[22]; 1132 b[23] = a[23]; 1133 b[24] = a[24]; 1134 b[25] = a[25]; 1135 1136 b[30] = a[30]; 1137 b[31] = a[31]; 1138 1139 // Stage 5. 1140 butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); 1141 butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); 1142 1143 a[4] = vaddq_s16(b[4], b[5]); 1144 a[5] = vsubq_s16(b[4], b[5]); 1145 a[6] = vsubq_s16(b[7], b[6]); 1146 a[7] = vaddq_s16(b[7], b[6]); 1147 1148 a[8] = b[8]; 1149 1150 butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); 1151 butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); 1152 1153 a[11] = b[11]; 1154 a[12] = b[12]; 1155 1156 a[15] = b[15]; 1157 1158 a[16] = vaddq_s16(b[19], b[16]); 1159 a[17] = vaddq_s16(b[18], b[17]); 1160 a[18] = vsubq_s16(b[17], b[18]); 1161 a[19] = vsubq_s16(b[16], b[19]); 1162 a[20] = vsubq_s16(b[23], b[20]); 1163 a[21] = vsubq_s16(b[22], b[21]); 1164 a[22] = vaddq_s16(b[21], b[22]); 1165 a[23] = vaddq_s16(b[20], b[23]); 1166 a[24] = vaddq_s16(b[27], b[24]); 1167 a[25] = vaddq_s16(b[26], b[25]); 1168 a[26] = vsubq_s16(b[25], b[26]); 1169 a[27] = vsubq_s16(b[24], b[27]); 1170 a[28] = vsubq_s16(b[31], b[28]); 1171 a[29] = vsubq_s16(b[30], b[29]); 1172 a[30] = vaddq_s16(b[29], b[30]); 1173 a[31] = vaddq_s16(b[28], b[31]); 1174 1175 // Stage 6. 1176 b[0] = a[0]; 1177 b[1] = a[1]; 1178 b[2] = a[2]; 1179 b[3] = a[3]; 1180 1181 butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); 1182 butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); 1183 1184 b[8] = vaddq_s16(a[8], a[9]); 1185 b[9] = vsubq_s16(a[8], a[9]); 1186 b[10] = vsubq_s16(a[11], a[10]); 1187 b[11] = vaddq_s16(a[11], a[10]); 1188 b[12] = vaddq_s16(a[12], a[13]); 1189 b[13] = vsubq_s16(a[12], a[13]); 1190 b[14] = vsubq_s16(a[15], a[14]); 1191 b[15] = vaddq_s16(a[15], a[14]); 1192 1193 b[16] = a[16]; 1194 b[19] = a[19]; 1195 b[20] = a[20]; 1196 b[23] = a[23]; 1197 b[24] = a[24]; 1198 b[27] = a[27]; 1199 b[28] = a[28]; 1200 b[31] = a[31]; 1201 1202 butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); 1203 butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); 1204 1205 butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); 1206 butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); 1207 1208 // Stage 7. 1209 a[0] = b[0]; 1210 a[1] = b[1]; 1211 a[2] = b[2]; 1212 a[3] = b[3]; 1213 a[4] = b[4]; 1214 a[5] = b[5]; 1215 a[6] = b[6]; 1216 a[7] = b[7]; 1217 1218 butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); 1219 butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); 1220 butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); 1221 butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); 1222 1223 a[16] = vaddq_s16(b[16], b[17]); 1224 a[17] = vsubq_s16(b[16], b[17]); 1225 a[18] = vsubq_s16(b[19], b[18]); 1226 a[19] = vaddq_s16(b[19], b[18]); 1227 a[20] = vaddq_s16(b[20], b[21]); 1228 a[21] = vsubq_s16(b[20], b[21]); 1229 a[22] = vsubq_s16(b[23], b[22]); 1230 a[23] = vaddq_s16(b[23], b[22]); 1231 a[24] = vaddq_s16(b[24], b[25]); 1232 a[25] = vsubq_s16(b[24], b[25]); 1233 a[26] = vsubq_s16(b[27], b[26]); 1234 a[27] = vaddq_s16(b[27], b[26]); 1235 a[28] = vaddq_s16(b[28], b[29]); 1236 a[29] = vsubq_s16(b[28], b[29]); 1237 a[30] = vsubq_s16(b[31], b[30]); 1238 a[31] = vaddq_s16(b[31], b[30]); 1239 1240 // Final stage. 1241 out[0] = a[0]; 1242 out[16] = a[1]; 1243 out[8] = a[2]; 1244 out[24] = a[3]; 1245 out[4] = a[4]; 1246 out[20] = a[5]; 1247 out[12] = a[6]; 1248 out[28] = a[7]; 1249 out[2] = a[8]; 1250 out[18] = a[9]; 1251 out[10] = a[10]; 1252 out[26] = a[11]; 1253 out[6] = a[12]; 1254 out[22] = a[13]; 1255 out[14] = a[14]; 1256 out[30] = a[15]; 1257 1258 butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); 1259 butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], 1260 &out[15]); 1261 butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); 1262 butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); 1263 butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); 1264 butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], 1265 &out[11]); 1266 butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], 1267 &out[19]); 1268 butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); 1269} 1270 1271#undef PASS_THROUGH 1272#undef ADD_S16_S32 1273#undef SUB_S16_S32 1274#undef ADDW_S16_S32 1275#undef SUBW_S16_S32 1276#undef ADD_S32 1277#undef SUB_S32 1278#undef BUTTERFLY_ONE_S16_S32 1279#undef BUTTERFLY_ONE_S32 1280#undef BUTTERFLY_TWO_S32 1281 1282// Transpose 8x8 to a new location. Don't use transpose_neon.h because those 1283// are all in-place. 1284// TODO(johannkoenig): share with other fdcts. 1285static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { 1286 // Swap 16 bit elements. 1287 const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); 1288 const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); 1289 const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); 1290 const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); 1291 1292 // Swap 32 bit elements. 1293 const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), 1294 vreinterpretq_s32_s16(c1.val[0])); 1295 const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), 1296 vreinterpretq_s32_s16(c1.val[1])); 1297 const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), 1298 vreinterpretq_s32_s16(c3.val[0])); 1299 const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), 1300 vreinterpretq_s32_s16(c3.val[1])); 1301 1302 // Swap 64 bit elements 1303 const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); 1304 const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); 1305 const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); 1306 const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); 1307 1308 b[0] = e0.val[0]; 1309 b[1] = e1.val[0]; 1310 b[2] = e2.val[0]; 1311 b[3] = e3.val[0]; 1312 b[4] = e0.val[1]; 1313 b[5] = e1.val[1]; 1314 b[6] = e2.val[1]; 1315 b[7] = e3.val[1]; 1316} 1317 1318void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { 1319 int16x8_t temp0[32]; 1320 int16x8_t temp1[32]; 1321 int16x8_t temp2[32]; 1322 int16x8_t temp3[32]; 1323 int16x8_t temp4[32]; 1324 int16x8_t temp5[32]; 1325 1326 // Process in 8x32 columns. 1327 load(input, stride, temp0); 1328 dct_body_first_pass(temp0, temp1); 1329 1330 load(input + 8, stride, temp0); 1331 dct_body_first_pass(temp0, temp2); 1332 1333 load(input + 16, stride, temp0); 1334 dct_body_first_pass(temp0, temp3); 1335 1336 load(input + 24, stride, temp0); 1337 dct_body_first_pass(temp0, temp4); 1338 1339 // Generate the top row by munging the first set of 8 from each one together. 1340 transpose_8x8(&temp1[0], &temp0[0]); 1341 transpose_8x8(&temp2[0], &temp0[8]); 1342 transpose_8x8(&temp3[0], &temp0[16]); 1343 transpose_8x8(&temp4[0], &temp0[24]); 1344 1345 dct_body_second_pass(temp0, temp5); 1346 1347 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1348 &temp5[5], &temp5[6], &temp5[7]); 1349 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1350 &temp5[13], &temp5[14], &temp5[15]); 1351 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1352 &temp5[21], &temp5[22], &temp5[23]); 1353 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1354 &temp5[29], &temp5[30], &temp5[31]); 1355 store(output, temp5); 1356 1357 // Second row of 8x32. 1358 transpose_8x8(&temp1[8], &temp0[0]); 1359 transpose_8x8(&temp2[8], &temp0[8]); 1360 transpose_8x8(&temp3[8], &temp0[16]); 1361 transpose_8x8(&temp4[8], &temp0[24]); 1362 1363 dct_body_second_pass(temp0, temp5); 1364 1365 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1366 &temp5[5], &temp5[6], &temp5[7]); 1367 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1368 &temp5[13], &temp5[14], &temp5[15]); 1369 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1370 &temp5[21], &temp5[22], &temp5[23]); 1371 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1372 &temp5[29], &temp5[30], &temp5[31]); 1373 store(output + 8 * 32, temp5); 1374 1375 // Third row of 8x32 1376 transpose_8x8(&temp1[16], &temp0[0]); 1377 transpose_8x8(&temp2[16], &temp0[8]); 1378 transpose_8x8(&temp3[16], &temp0[16]); 1379 transpose_8x8(&temp4[16], &temp0[24]); 1380 1381 dct_body_second_pass(temp0, temp5); 1382 1383 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1384 &temp5[5], &temp5[6], &temp5[7]); 1385 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1386 &temp5[13], &temp5[14], &temp5[15]); 1387 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1388 &temp5[21], &temp5[22], &temp5[23]); 1389 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1390 &temp5[29], &temp5[30], &temp5[31]); 1391 store(output + 16 * 32, temp5); 1392 1393 // Final row of 8x32. 1394 transpose_8x8(&temp1[24], &temp0[0]); 1395 transpose_8x8(&temp2[24], &temp0[8]); 1396 transpose_8x8(&temp3[24], &temp0[16]); 1397 transpose_8x8(&temp4[24], &temp0[24]); 1398 1399 dct_body_second_pass(temp0, temp5); 1400 1401 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1402 &temp5[5], &temp5[6], &temp5[7]); 1403 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1404 &temp5[13], &temp5[14], &temp5[15]); 1405 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1406 &temp5[21], &temp5[22], &temp5[23]); 1407 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1408 &temp5[29], &temp5[30], &temp5[31]); 1409 store(output + 24 * 32, temp5); 1410} 1411 1412void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, 1413 int stride) { 1414 int16x8_t temp0[32]; 1415 int16x8_t temp1[32]; 1416 int16x8_t temp2[32]; 1417 int16x8_t temp3[32]; 1418 int16x8_t temp4[32]; 1419 int16x8_t temp5[32]; 1420 1421 // Process in 8x32 columns. 1422 load(input, stride, temp0); 1423 dct_body_first_pass(temp0, temp1); 1424 1425 load(input + 8, stride, temp0); 1426 dct_body_first_pass(temp0, temp2); 1427 1428 load(input + 16, stride, temp0); 1429 dct_body_first_pass(temp0, temp3); 1430 1431 load(input + 24, stride, temp0); 1432 dct_body_first_pass(temp0, temp4); 1433 1434 // Generate the top row by munging the first set of 8 from each one together. 1435 transpose_8x8(&temp1[0], &temp0[0]); 1436 transpose_8x8(&temp2[0], &temp0[8]); 1437 transpose_8x8(&temp3[0], &temp0[16]); 1438 transpose_8x8(&temp4[0], &temp0[24]); 1439 1440 dct_body_second_pass_rd(temp0, temp5); 1441 1442 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1443 &temp5[5], &temp5[6], &temp5[7]); 1444 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1445 &temp5[13], &temp5[14], &temp5[15]); 1446 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1447 &temp5[21], &temp5[22], &temp5[23]); 1448 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1449 &temp5[29], &temp5[30], &temp5[31]); 1450 store(output, temp5); 1451 1452 // Second row of 8x32. 1453 transpose_8x8(&temp1[8], &temp0[0]); 1454 transpose_8x8(&temp2[8], &temp0[8]); 1455 transpose_8x8(&temp3[8], &temp0[16]); 1456 transpose_8x8(&temp4[8], &temp0[24]); 1457 1458 dct_body_second_pass_rd(temp0, temp5); 1459 1460 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1461 &temp5[5], &temp5[6], &temp5[7]); 1462 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1463 &temp5[13], &temp5[14], &temp5[15]); 1464 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1465 &temp5[21], &temp5[22], &temp5[23]); 1466 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1467 &temp5[29], &temp5[30], &temp5[31]); 1468 store(output + 8 * 32, temp5); 1469 1470 // Third row of 8x32 1471 transpose_8x8(&temp1[16], &temp0[0]); 1472 transpose_8x8(&temp2[16], &temp0[8]); 1473 transpose_8x8(&temp3[16], &temp0[16]); 1474 transpose_8x8(&temp4[16], &temp0[24]); 1475 1476 dct_body_second_pass_rd(temp0, temp5); 1477 1478 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1479 &temp5[5], &temp5[6], &temp5[7]); 1480 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1481 &temp5[13], &temp5[14], &temp5[15]); 1482 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1483 &temp5[21], &temp5[22], &temp5[23]); 1484 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1485 &temp5[29], &temp5[30], &temp5[31]); 1486 store(output + 16 * 32, temp5); 1487 1488 // Final row of 8x32. 1489 transpose_8x8(&temp1[24], &temp0[0]); 1490 transpose_8x8(&temp2[24], &temp0[8]); 1491 transpose_8x8(&temp3[24], &temp0[16]); 1492 transpose_8x8(&temp4[24], &temp0[24]); 1493 1494 dct_body_second_pass_rd(temp0, temp5); 1495 1496 transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], 1497 &temp5[5], &temp5[6], &temp5[7]); 1498 transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], 1499 &temp5[13], &temp5[14], &temp5[15]); 1500 transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], 1501 &temp5[21], &temp5[22], &temp5[23]); 1502 transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], 1503 &temp5[29], &temp5[30], &temp5[31]); 1504 store(output + 24 * 32, temp5); 1505} 1506#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && 1507 // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 1508