/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"

// Load two 8-lane rows from the transposed coefficient buffer, which is laid
// out as consecutive 8-element rows (row stride of 8 int16 values).
static INLINE void load_from_transformed(const int16_t *const trans_buf,
                                         const int first, const int second,
                                         int16x8_t *const q0,
                                         int16x8_t *const q1) {
  *q0 = vld1q_s16(trans_buf + first * 8);
  *q1 = vld1q_s16(trans_buf + second * 8);
}

// Load two 8-lane rows from the 32-wide intermediate output buffer
// (row stride of 32 int16 values).
static INLINE void load_from_output(const int16_t *const out, const int first,
                                    const int second, int16x8_t *const q0,
                                    int16x8_t *const q1) {
  *q0 = vld1q_s16(out + first * 32);
  *q1 = vld1q_s16(out + second * 32);
}

// Store two 8-lane rows into the 32-wide intermediate output buffer
// (row stride of 32 int16 values).
static INLINE void store_in_output(int16_t *const out, const int first,
                                   const int second, const int16x8_t q0,
                                   const int16x8_t q1) {
  vst1q_s16(out + first * 32, q0);
  vst1q_s16(out + second * 32, q1);
}

// Round-shift four result rows by 6, add them to the destination pixels and
// store the saturated results.  q0/q1 go to rows p1 and p1+stride; q2/q3 go
// to rows p2-stride and p2 (note the mirrored order for the p2 pair, which
// lets the caller pass a downward-growing and an upward-growing pointer).
static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2,
                                         const int stride, int16x8_t q0,
                                         int16x8_t q1, int16x8_t q2,
                                         int16x8_t q3) {
  uint8x8_t d[4];

  d[0] = vld1_u8(p1);
  p1 += stride;
  d[1] = vld1_u8(p1);
  d[3] = vld1_u8(p2);
  p2 -= stride;
  d[2] = vld1_u8(p2);

  // Final rounding shift of the 2-pass 32x32 inverse transform.
  q0 = vrshrq_n_s16(q0, 6);
  q1 = vrshrq_n_s16(q1, 6);
  q2 = vrshrq_n_s16(q2, 6);
  q3 = vrshrq_n_s16(q3, 6);

  // Widening add of the 8-bit destination pixels to the 16-bit results.
  q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0]));
  q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1]));
  q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2]));
  q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3]));

  // Saturate back to unsigned 8-bit pixels.
  d[0] = vqmovun_s16(q0);
  d[1] = vqmovun_s16(q1);
  d[2] = vqmovun_s16(q2);
  d[3] = vqmovun_s16(q3);

  vst1_u8(p1, d[1]);
  p1 -= stride;
  vst1_u8(p1, d[0]);
  vst1_u8(p2, d[2]);
  p2 += stride;
  vst1_u8(p2, d[3]);
}

// High-bitdepth (bd == 8) variant of store_combine_results: destination
// pixels are 16 bits wide, but values are still clamped to the 8-bit range
// via vqmovun_s16 before widening back to uint16.
static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2,
                                                    const int stride,
                                                    int16x8_t q0, int16x8_t q1,
                                                    int16x8_t q2,
                                                    int16x8_t q3) {
  uint16x8_t d[4];

  d[0] = vld1q_u16(p1);
  p1 += stride;
  d[1] = vld1q_u16(p1);
  d[3] = vld1q_u16(p2);
  p2 -= stride;
  d[2] = vld1q_u16(p2);

  // Final rounding shift of the 2-pass 32x32 inverse transform.
  q0 = vrshrq_n_s16(q0, 6);
  q1 = vrshrq_n_s16(q1, 6);
  q2 = vrshrq_n_s16(q2, 6);
  q3 = vrshrq_n_s16(q3, 6);

  q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0]));
  q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1]));
  q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2]));
  q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3]));

  // Clamp to [0, 255] (bd == 8) and widen back to 16-bit lanes.
  d[0] = vmovl_u8(vqmovun_s16(q0));
  d[1] = vmovl_u8(vqmovun_s16(q1));
  d[2] = vmovl_u8(vqmovun_s16(q2));
  d[3] = vmovl_u8(vqmovun_s16(q3));

  vst1q_u16(p1, d[1]);
  p1 -= stride;
  vst1q_u16(p1, d[0]);
  vst1q_u16(p2, d[2]);
  p2 += stride;
  vst1q_u16(p2, d[3]);
}

// Butterfly/rotation step shared by all idct stages:
//   *qOut0 = round(qIn0 * first_const - qIn1 * second_const) >> DCT_CONST_BITS
//   *qOut1 = round(qIn0 * second_const + qIn1 * first_const) >> DCT_CONST_BITS
// Products are accumulated in 32 bits before the rounding narrow.
static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1,
                                const int16_t first_const,
                                const int16_t second_const,
                                int16x8_t *const qOut0,
                                int16x8_t *const qOut1) {
  int32x4_t q[4];
  int16x4_t d[6];

  d[0] = vget_low_s16(qIn0);
  d[1] = vget_high_s16(qIn0);
  d[2] = vget_low_s16(qIn1);
  d[3] = vget_high_s16(qIn1);

  // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9.
  d[4] = vdup_n_s16(first_const);
  d[5] = vdup_n_s16(second_const);

  q[0] = vmull_s16(d[0], d[4]);
  q[1] = vmull_s16(d[1], d[4]);
  q[0] = vmlsl_s16(q[0], d[2], d[5]);
  q[1] = vmlsl_s16(q[1], d[3], d[5]);

  q[2] = vmull_s16(d[0], d[5]);
  q[3] = vmull_s16(d[1], d[5]);
  q[2] = vmlal_s16(q[2], d[2], d[4]);
  q[3] = vmlal_s16(q[3], d[3], d[4]);

  *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS),
                        vrshrn_n_s32(q[1], DCT_CONST_BITS));
  *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS),
                        vrshrn_n_s32(q[3], DCT_CONST_BITS));
}

// Load an 8x8 tile from a buffer whose rows are 32 int16 values apart.
static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0,
                               int16x8_t *const s1, int16x8_t *const s2,
                               int16x8_t *const s3, int16x8_t *const s4,
                               int16x8_t *const s5, int16x8_t *const s6,
                               int16x8_t *const s7) {
  *s0 = vld1q_s16(in);
  in += 32;
  *s1 = vld1q_s16(in);
  in += 32;
  *s2 = vld1q_s16(in);
  in += 32;
  *s3 = vld1q_s16(in);
  in += 32;
  *s4 = vld1q_s16(in);
  in += 32;
  *s5 = vld1q_s16(in);
  in += 32;
  *s6 = vld1q_s16(in);
  in += 32;
  *s7 = vld1q_s16(in);
}

// Transpose an 8x8 tile in registers and append it to *out as 8 consecutive
// rows, advancing *out by 64 values.
static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1,
                                               int16x8_t a2, int16x8_t a3,
                                               int16x8_t a4, int16x8_t a5,
                                               int16x8_t a6, int16x8_t a7,
                                               int16_t **out) {
  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  vst1q_s16(*out, a0);
  *out += 8;
  vst1q_s16(*out, a1);
  *out += 8;
  vst1q_s16(*out, a2);
  *out += 8;
  vst1q_s16(*out, a3);
  *out += 8;
  vst1q_s16(*out, a4);
  *out += 8;
  vst1q_s16(*out, a5);
  *out += 8;
  vst1q_s16(*out, a6);
  *out += 8;
  vst1q_s16(*out, a7);
  *out += 8;
}

// Transpose a 32-wide x 8-high slice of the input into t_buf as four
// consecutive transposed 8x8 tiles (32 rows of 8 values).
static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
  int i;
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  for (i = 0; i < 4; i++, input += 8) {
    load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
    transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
// Same as load_s16x8q, but reads tran_low_t coefficients (which may be wider
// than 16 bits) and narrows them to int16 lanes.
static INLINE void load_s16x8q_tran_low(
    const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1,
    int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4,
    int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = load_tran_low_to_s16q(in);
  in += 32;
  *s1 = load_tran_low_to_s16q(in);
  in += 32;
  *s2 = load_tran_low_to_s16q(in);
  in += 32;
  *s3 = load_tran_low_to_s16q(in);
  in += 32;
  *s4 = load_tran_low_to_s16q(in);
  in += 32;
  *s5 = load_tran_low_to_s16q(in);
  in += 32;
  *s6 = load_tran_low_to_s16q(in);
  in += 32;
  *s7 = load_tran_low_to_s16q(in);
}

// tran_low_t flavor of idct32_transpose_pair, used for the first pass when
// coefficients come straight from the bitstream buffer.
static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
                                                  int16_t *t_buf) {
  int i;
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  for (i = 0; i < 4; i++, input += 8) {
    load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
    transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
  }
}
#else   // !CONFIG_VP9_HIGHBITDEPTH
// Without high bitdepth, tran_low_t is int16_t, so the plain version works.
#define idct32_transpose_pair_tran_low idct32_transpose_pair
#endif  // CONFIG_VP9_HIGHBITDEPTH

// Final stage (stage 7) of the first pass: combine the partial results held
// in q[2..15] with the band outputs already written to `out`, and store all
// 32 rows back to `out` for the second pass to consume.
static INLINE void idct32_bands_end_1st_pass(int16_t *const out,
                                             int16x8_t *const q) {
  store_in_output(out, 16, 17, q[6], q[7]);
  store_in_output(out, 14, 15, q[8], q[9]);

  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 30, 31, q[6], q[7]);
  store_in_output(out, 0, 1, q[4], q[5]);

  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = vaddq_s16(q[10], q[1]);
  q[3] = vaddq_s16(q[11], q[0]);
  q[4] = vsubq_s16(q[11], q[0]);
  q[5] = vsubq_s16(q[10], q[1]);

  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = vaddq_s16(q[4], q[1]);
  q[9] = vaddq_s16(q[5], q[0]);
  q[6] = vsubq_s16(q[5], q[0]);
  q[7] = vsubq_s16(q[4], q[1]);
  store_in_output(out, 18, 19, q[6], q[7]);
  store_in_output(out, 12, 13, q[8], q[9]);

  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 28, 29, q[6], q[7]);
  store_in_output(out, 2, 3, q[4], q[5]);

  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = vaddq_s16(q[12], q[1]);
  q[3] = vaddq_s16(q[13], q[0]);
  q[4] = vsubq_s16(q[13], q[0]);
  q[5] = vsubq_s16(q[12], q[1]);

  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = vaddq_s16(q[4], q[1]);
  q[9] = vaddq_s16(q[5], q[0]);
  q[6] = vsubq_s16(q[5], q[0]);
  q[7] = vsubq_s16(q[4], q[1]);
  store_in_output(out, 20, 21, q[6], q[7]);
  store_in_output(out, 10, 11, q[8], q[9]);

  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 26, 27, q[6], q[7]);
  store_in_output(out, 4, 5, q[4], q[5]);

  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = vaddq_s16(q[14], q[1]);
  q[3] = vaddq_s16(q[15], q[0]);
  q[4] = vsubq_s16(q[15], q[0]);
  q[5] = vsubq_s16(q[14], q[1]);

  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = vaddq_s16(q[4], q[1]);
  q[9] = vaddq_s16(q[5], q[0]);
  q[6] = vsubq_s16(q[5], q[0]);
  q[7] = vsubq_s16(q[4], q[1]);
  store_in_output(out, 22, 23, q[6], q[7]);
  store_in_output(out, 8, 9, q[8], q[9]);

  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = vaddq_s16(q[2], q[1]);
  q[5] = vaddq_s16(q[3], q[0]);
  q[6] = vsubq_s16(q[3], q[0]);
  q[7] = vsubq_s16(q[2], q[1]);
  store_in_output(out, 24, 25, q[6], q[7]);
  store_in_output(out, 6, 7, q[4], q[5]);
}

// Final stage of the second pass: same combine pattern as the first pass, but
// uses final_add/final_sub (saturating variants, per idct_neon.h) and writes
// the reconstructed pixels directly into the 8-bit destination frame.
// dest0/dest1 walk rows 0..7 and 31..24; dest2/dest3 walk rows 16..23 and
// 15..8, mirroring the symmetric output ordering of the idct32 bands.
static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
                                             uint8_t *const dest,
                                             const int stride,
                                             int16x8_t *const q) {
  uint8_t *dest0 = dest + 0 * stride;
  uint8_t *dest1 = dest + 31 * stride;
  uint8_t *dest2 = dest + 16 * stride;
  uint8_t *dest3 = dest + 15 * stride;
  const int str2 = stride << 1;

  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = vaddq_s16(q[10], q[1]);
  q[3] = vaddq_s16(q[11], q[0]);
  q[4] = vsubq_s16(q[11], q[0]);
  q[5] = vsubq_s16(q[10], q[1]);

  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = vaddq_s16(q[12], q[1]);
  q[3] = vaddq_s16(q[13], q[0]);
  q[4] = vsubq_s16(q[13], q[0]);
  q[5] = vsubq_s16(q[12], q[1]);

  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = vaddq_s16(q[14], q[1]);
  q[3] = vaddq_s16(q[15], q[0]);
  q[4] = vsubq_s16(q[15], q[0]);
  q[5] = vsubq_s16(q[14], q[1]);

  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);

  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
}

// High-bitdepth (bd == 8) counterpart of idct32_bands_end_2nd_pass: identical
// combine structure, but the destination holds 16-bit pixels.
static INLINE void highbd_idct32_bands_end_2nd_pass_bd8(
    const int16_t *const out, uint16_t *const dest, const int stride,
    int16x8_t *const q) {
  uint16_t *dest0 = dest + 0 * stride;
  uint16_t *dest1 = dest + 31 * stride;
  uint16_t *dest2 = dest + 16 * stride;
  uint16_t *dest3 = dest + 15 * stride;
  const int str2 = stride << 1;

  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 30, 31, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 12, 13, &q[0], &q[1]);
  q[2] = vaddq_s16(q[10], q[1]);
  q[3] = vaddq_s16(q[11], q[0]);
  q[4] = vsubq_s16(q[11], q[0]);
  q[5] = vsubq_s16(q[10], q[1]);

  load_from_output(out, 18, 19, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 28, 29, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 10, 11, &q[0], &q[1]);
  q[2] = vaddq_s16(q[12], q[1]);
  q[3] = vaddq_s16(q[13], q[0]);
  q[4] = vsubq_s16(q[13], q[0]);
  q[5] = vsubq_s16(q[12], q[1]);

  load_from_output(out, 20, 21, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);
  dest2 += str2;
  dest3 -= str2;

  load_from_output(out, 26, 27, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
  dest0 += str2;
  dest1 -= str2;

  load_from_output(out, 8, 9, &q[0], &q[1]);
  q[2] = vaddq_s16(q[14], q[1]);
  q[3] = vaddq_s16(q[15], q[0]);
  q[4] = vsubq_s16(q[15], q[0]);
  q[5] = vsubq_s16(q[14], q[1]);

  load_from_output(out, 22, 23, &q[0], &q[1]);
  q[8] = final_add(q[4], q[1]);
  q[9] = final_add(q[5], q[0]);
  q[6] = final_sub(q[5], q[0]);
  q[7] = final_sub(q[4], q[1]);
  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
                                   q[9]);

  load_from_output(out, 24, 25, &q[0], &q[1]);
  q[4] = final_add(q[2], q[1]);
  q[5] = final_add(q[3], q[0]);
  q[6] = final_sub(q[3], q[0]);
  q[7] = final_sub(q[2], q[1]);
  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
                                   q[7]);
}

// Full 32x32 inverse transform, computed as two 1-D passes (rows then
// columns).  Pass 1 reads `input` and writes int16 results to pass1[];
// pass 2 reads pass1[] and adds the final result to the destination frame.
// Each pass processes the 32 columns in four 8-wide bands; within a band the
// butterfly network is evaluated in four blocks (A: outputs 16-31,
// B: 20-27, C: 8-15, D: 0-7), spilling intermediate rows through `out`.
// When highbd_flag is set, `dest` actually points to 16-bit pixels
// (reinterpreted via CAST_TO_SHORTPTR) and the bd8 store path is used.
void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
                        const int stride, const int highbd_flag) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];  // transposed 32x8 working slice
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
  const int16_t *input_pass2 = pass1;  // input of pass2 is the result of pass1
  int16_t *out;
  int16x8_t q[16];
  uint16_t *dst = CAST_TO_SHORTPTR(dest);

  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
       idct32_pass_loop++, out = pass2) {
    for (i = 0; i < 4; i++, out += 8) {  // idct32_bands_loop
      if (idct32_pass_loop == 0) {
        // First pass reads tran_low_t coefficients.
        idct32_transpose_pair_tran_low(input, trans_buf);
        input += 32 * 8;
      } else {
        // Second pass reads the int16 results of the first pass.
        idct32_transpose_pair(input_pass2, trans_buf);
        input_pass2 += 32 * 8;
      }

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
      // part of stage 2
      q[4] = vaddq_s16(q[0], q[1]);
      q[13] = vsubq_s16(q[0], q[1]);
      q[6] = vaddq_s16(q[2], q[3]);
      q[14] = vsubq_s16(q[2], q[3]);
      // part of stage 3
      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);

      // generate 18,19,28,29
      // part of stage 1
      load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
      // part of stage 2
      q[13] = vsubq_s16(q[3], q[2]);
      q[3] = vaddq_s16(q[3], q[2]);
      q[14] = vsubq_s16(q[1], q[0]);
      q[2] = vaddq_s16(q[1], q[0]);
      // part of stage 3
      do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
      // part of stage 4
      q[8] = vaddq_s16(q[4], q[2]);
      q[9] = vaddq_s16(q[5], q[0]);
      q[10] = vaddq_s16(q[7], q[1]);
      q[15] = vaddq_s16(q[6], q[3]);
      q[13] = vsubq_s16(q[5], q[0]);
      q[14] = vsubq_s16(q[7], q[1]);
      store_in_output(out, 16, 31, q[8], q[15]);
      store_in_output(out, 17, 30, q[9], q[10]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
      store_in_output(out, 29, 18, q[1], q[0]);
      // part of stage 4
      q[13] = vsubq_s16(q[4], q[2]);
      q[14] = vsubq_s16(q[6], q[3]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
      store_in_output(out, 19, 28, q[4], q[6]);

      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
      // part of stage 2
      q[13] = vsubq_s16(q[0], q[1]);
      q[0] = vaddq_s16(q[0], q[1]);
      q[14] = vsubq_s16(q[2], q[3]);
      q[2] = vaddq_s16(q[2], q[3]);
      // part of stage 3
      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);

      // generate 22,23,24,25
      // part of stage 1
      load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
      // part of stage 2
      q[14] = vsubq_s16(q[4], q[5]);
      q[5] = vaddq_s16(q[4], q[5]);
      q[13] = vsubq_s16(q[6], q[7]);
      q[6] = vaddq_s16(q[6], q[7]);
      // part of stage 3
      do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
      // part of stage 4
      q[10] = vaddq_s16(q[7], q[1]);
      q[11] = vaddq_s16(q[5], q[0]);
      q[12] = vaddq_s16(q[6], q[2]);
      q[15] = vaddq_s16(q[4], q[3]);
      // part of stage 6
      load_from_output(out, 16, 17, &q[14], &q[13]);
      q[8] = vaddq_s16(q[14], q[11]);
      q[9] = vaddq_s16(q[13], q[10]);
      q[13] = vsubq_s16(q[13], q[10]);
      q[11] = vsubq_s16(q[14], q[11]);
      store_in_output(out, 17, 16, q[9], q[8]);
      load_from_output(out, 30, 31, &q[14], &q[9]);
      q[8] = vsubq_s16(q[9], q[12]);
      q[10] = vaddq_s16(q[14], q[15]);
      q[14] = vsubq_s16(q[14], q[15]);
      q[12] = vaddq_s16(q[9], q[12]);
      store_in_output(out, 30, 31, q[10], q[12]);
      // part of stage 7
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 25, 22, q[14], q[13]);
      do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 24, 23, q[14], q[13]);
      // part of stage 4
      q[14] = vsubq_s16(q[5], q[0]);
      q[13] = vsubq_s16(q[6], q[2]);
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
      q[14] = vsubq_s16(q[7], q[1]);
      q[13] = vsubq_s16(q[4], q[3]);
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
      // part of stage 6
      load_from_output(out, 18, 19, &q[14], &q[13]);
      q[8] = vaddq_s16(q[14], q[1]);
      q[9] = vaddq_s16(q[13], q[6]);
      q[13] = vsubq_s16(q[13], q[6]);
      q[1] = vsubq_s16(q[14], q[1]);
      store_in_output(out, 18, 19, q[8], q[9]);
      load_from_output(out, 28, 29, &q[8], &q[9]);
      q[14] = vsubq_s16(q[8], q[5]);
      q[10] = vaddq_s16(q[8], q[5]);
      q[11] = vaddq_s16(q[9], q[0]);
      q[0] = vsubq_s16(q[9], q[0]);
      store_in_output(out, 28, 29, q[10], q[11]);
      // part of stage 7
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
      store_in_output(out, 20, 27, q[13], q[14]);
      do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
      store_in_output(out, 21, 26, q[1], q[0]);

      // -----------------------------------------
      // BLOCK C: 8-10,11-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
      // part of stage 3
      q[13] = vsubq_s16(q[0], q[1]);
      q[0] = vaddq_s16(q[0], q[1]);
      q[14] = vsubq_s16(q[2], q[3]);
      q[2] = vaddq_s16(q[2], q[3]);
      // part of stage 4
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);

      // generate 10,11,12,13
      // part of stage 2
      load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
      // part of stage 3
      q[14] = vsubq_s16(q[4], q[5]);
      q[5] = vaddq_s16(q[4], q[5]);
      q[13] = vsubq_s16(q[6], q[7]);
      q[6] = vaddq_s16(q[6], q[7]);
      // part of stage 4
      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
      // part of stage 5
      q[8] = vaddq_s16(q[0], q[5]);
      q[9] = vaddq_s16(q[1], q[7]);
      q[13] = vsubq_s16(q[1], q[7]);
      q[14] = vsubq_s16(q[3], q[4]);
      q[10] = vaddq_s16(q[3], q[4]);
      q[15] = vaddq_s16(q[2], q[6]);
      store_in_output(out, 8, 15, q[8], q[15]);
      store_in_output(out, 9, 14, q[9], q[10]);
      // part of stage 6
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
      store_in_output(out, 13, 10, q[3], q[1]);
      q[13] = vsubq_s16(q[0], q[5]);
      q[14] = vsubq_s16(q[2], q[6]);
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
      store_in_output(out, 11, 12, q[1], q[3]);

      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
      load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
      // part of stage 4
      q[13] = vsubq_s16(q[0], q[1]);
      q[0] = vaddq_s16(q[0], q[1]);
      q[14] = vsubq_s16(q[2], q[3]);
      q[2] = vaddq_s16(q[2], q[3]);
      // part of stage 5
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);

      // generate 0,1,2,3
      // part of stage 4
      load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
      load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
      // part of stage 5
      q[4] = vaddq_s16(q[7], q[6]);
      q[7] = vsubq_s16(q[7], q[6]);
      q[6] = vsubq_s16(q[5], q[14]);
      q[5] = vaddq_s16(q[5], q[14]);
      // part of stage 6
      q[8] = vaddq_s16(q[4], q[2]);
      q[9] = vaddq_s16(q[5], q[3]);
      q[10] = vaddq_s16(q[6], q[1]);
      q[11] = vaddq_s16(q[7], q[0]);
      q[12] = vsubq_s16(q[7], q[0]);
      q[13] = vsubq_s16(q[6], q[1]);
      q[14] = vsubq_s16(q[5], q[3]);
      q[15] = vsubq_s16(q[4], q[2]);
      // part of stage 7
      load_from_output(out, 14, 15, &q[0], &q[1]);
      q[2] = vaddq_s16(q[8], q[1]);
      q[3] = vaddq_s16(q[9], q[0]);
      q[4] = vsubq_s16(q[9], q[0]);
      q[5] = vsubq_s16(q[8], q[1]);
      load_from_output(out, 16, 17, &q[0], &q[1]);
      q[8] = final_add(q[4], q[1]);
      q[9] = final_add(q[5], q[0]);
      q[6] = final_sub(q[5], q[0]);
      q[7] = final_sub(q[4], q[1]);

      // Remaining combines are done per-pass: pass 1 stores to the
      // intermediate buffer, pass 2 adds into the destination frame.
      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out, q);
      } else {
        if (highbd_flag) {
          highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q);
          dst += 8;
        } else {
          idct32_bands_end_2nd_pass(out, dest, stride, q);
          dest += 8;
        }
      }
    }
  }
}

// Public entry point for the full (1024-coefficient) 8-bit 32x32 idct+add.
void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  vpx_idct32_32_neon(input, dest, stride, 0);
}