/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"

#define LOAD_FROM_TRANSPOSED(prev, first, second) \
  q14s16 = vld1q_s16(trans_buf + first * 8); \
  q13s16 = vld1q_s16(trans_buf + second * 8);

#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
  qA = vld1q_s16(out + first * 32); \
  qB = vld1q_s16(out + second * 32);

#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
  vst1q_s16(out + first * 32, qA); \
  vst1q_s16(out + second * 32, qB);
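
/* The two store-combine helpers below finish reconstruction for four rows
 * at a time: round the residual (vrshrq_n_s16 by 6), widen-add the eight
 * prediction bytes, saturate back to u8 (vqmovun_s16), and store. As used
 * by the second pass, the "center" variant walks outward from rows 15/16
 * and the "extreme" variant inward from rows 0/31. */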
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
                                 q6s16, q7s16, q8s16, q9s16);
static INLINE void __STORE_COMBINE_CENTER_RESULTS(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16) {
  int16x4_t d8s16, d9s16, d10s16, d11s16;

  d8s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d11s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d9s16 = vld1_s16((int16_t *)p1);
  d10s16 = vld1_s16((int16_t *)p2);

  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);

  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d9s16)));
  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
                                         vreinterpret_u8_s16(d10s16)));
  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
                                         vreinterpret_u8_s16(d11s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d8s16)));

  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));

  vst1_s16((int16_t *)p1, d9s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d10s16);
  p2 += stride;
  vst1_s16((int16_t *)p1, d8s16);
  vst1_s16((int16_t *)p2, d11s16);
  return;
}

#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
                                  q4s16, q5s16, q6s16, q7s16);
static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q4s16,
    int16x8_t q5s16,
    int16x8_t q6s16,
    int16x8_t q7s16) {
  int16x4_t d4s16, d5s16, d6s16, d7s16;

  d4s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d7s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d5s16 = vld1_s16((int16_t *)p1);
  d6s16 = vld1_s16((int16_t *)p2);

  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);

  q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
                                         vreinterpret_u8_s16(d5s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d6s16)));
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d7s16)));
  q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
                                         vreinterpret_u8_s16(d4s16)));

  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));

  vst1_s16((int16_t *)p1, d5s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d6s16);
  p2 += stride;
  vst1_s16((int16_t *)p2, d7s16);
  vst1_s16((int16_t *)p1, d4s16);
  return;
}
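
/* DO_BUTTERFLY performs the rotation used throughout the transform:
 *   *qAs16 = round(q14s16 * first_const - q13s16 * second_const) >> 14
 *   *qBs16 = round(q14s16 * second_const + q13s16 * first_const) >> 14
 * computed in 32-bit precision with a saturating narrow, i.e. the NEON
 * counterpart of dct_const_round_shift() applied to the 14-bit fixed-point
 * cospi_*_64 constants from txfm_common.h. */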
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
static INLINE void DO_BUTTERFLY(
    int16x8_t q14s16,
    int16x8_t q13s16,
    int16_t first_const,
    int16_t second_const,
    int16x8_t *qAs16,
    int16x8_t *qBs16) {
  int16x4_t d30s16, d31s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
  int16x4_t dCs16, dDs16, dAs16, dBs16;

  dCs16 = vget_low_s16(q14s16);
  dDs16 = vget_high_s16(q14s16);
  dAs16 = vget_low_s16(q13s16);
  dBs16 = vget_high_s16(q13s16);

  d30s16 = vdup_n_s16(first_const);
  d31s16 = vdup_n_s16(second_const);

  q8s32 = vmull_s16(dCs16, d30s16);
  q10s32 = vmull_s16(dAs16, d31s16);
  q9s32 = vmull_s16(dDs16, d30s16);
  q11s32 = vmull_s16(dBs16, d31s16);
  q12s32 = vmull_s16(dCs16, d31s16);

  q8s32 = vsubq_s32(q8s32, q10s32);
  q9s32 = vsubq_s32(q9s32, q11s32);

  q10s32 = vmull_s16(dDs16, d31s16);
  q11s32 = vmull_s16(dAs16, d30s16);
  q15s32 = vmull_s16(dBs16, d30s16);

  q11s32 = vaddq_s32(q12s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q15s32);

  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
                        vqrshrn_n_s32(q9s32, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
                        vqrshrn_n_s32(q10s32, 14));
  return;
}
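
/* Transpose an 8x32 strip of the input. Each loop iteration transposes one
 * 8x8 sub-block (the vcombine pairs emulate the vswp of the original
 * assembly, followed by 32-bit and 16-bit vtrn) and appends it to t_buf,
 * so t_buf ends up holding 32 contiguous rows of 8 coefficients. */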
static INLINE void idct32_transpose_pair(
    int16_t *input,
    int16_t *t_buf) {
  int16_t *in;
  int i;
  const int stride = 32;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  for (i = 0; i < 4; i++, input += 8) {
    in = input;
    q8s16 = vld1q_s16(in);
    in += stride;
    q9s16 = vld1q_s16(in);
    in += stride;
    q10s16 = vld1q_s16(in);
    in += stride;
    q11s16 = vld1q_s16(in);
    in += stride;
    q12s16 = vld1q_s16(in);
    in += stride;
    q13s16 = vld1q_s16(in);
    in += stride;
    q14s16 = vld1q_s16(in);
    in += stride;
    q15s16 = vld1q_s16(in);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

    q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
    q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    q12s16 = vcombine_s16(d17s16, d25s16);
    q13s16 = vcombine_s16(d19s16, d27s16);
    q14s16 = vcombine_s16(d21s16, d29s16);
    q15s16 = vcombine_s16(d23s16, d31s16);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
                        vreinterpretq_s32_s16(q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
                        vreinterpretq_s32_s16(q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
                        vreinterpretq_s32_s16(q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
                        vreinterpretq_s32_s16(q15s16));

    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    vst1q_s16(t_buf, q0x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q0x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[1]);
    t_buf += 8;
  }
  return;
}
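
/* Final combination stage of the first pass for one 8-column band: add and
 * subtract the stage-7 results against the rows already parked in |out|,
 * then write all 32 rows back to the pass buffer. No rounding or pixel add
 * happens here; that is deferred to the second pass. */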
static INLINE void idct32_bands_end_1st_pass(
    int16_t *out,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);

  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);

  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);

  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);

  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);

  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);

  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);

  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
  return;
}
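
/* Second-pass counterpart of the function above: the same combination step,
 * but the 32 output rows are rounded, added to the prediction in |dest| and
 * stored as pixels via the STORE_COMBINE_* helpers instead of being written
 * back to a scratch buffer. */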
static INLINE void idct32_bands_end_2nd_pass(
    int16_t *out,
    uint8_t *dest,
    int stride,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  uint8_t *r6 = dest + 31 * stride;
  uint8_t *r7 = dest /* + 0 * stride */;
  uint8_t *r9 = dest + 15 * stride;
  uint8_t *r10 = dest + 16 * stride;
  int str2 = stride << 1;
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  return;
}
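
/* Full 32x32 inverse DCT plus reconstruction. Pass 1 transforms the rows of
 * |input| into pass1[]; pass 2 re-reads pass1[] (transposing again, which
 * turns the row transform into a column transform) and adds the result to
 * the prediction in |dest|. Each pass processes the block in four 8-wide
 * bands, and each band computes the butterfly in four blocks (A: rows
 * 16-19,28-31; B: 20-27; C: 8-15; D: 0-7) so that rows stored to |out| by
 * earlier blocks are ready when later stages reload them. */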
void vpx_idct32x32_1024_add_neon(
    int16_t *input,
    uint8_t *dest,
    int stride) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
  int16_t *out;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  for (idct32_pass_loop = 0, out = pass1;
       idct32_pass_loop < 2;
       idct32_pass_loop++,
       input = pass1,  // the input of pass2 is the result of pass1
       out = pass2) {
    for (i = 0;
         i < 4; i++,
         input += 32 * 8, out += 8) {  // idct32_bands_loop
      idct32_transpose_pair(input, trans_buf);

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      LOAD_FROM_TRANSPOSED(0, 1, 31)
      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(31, 17, 15)
      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
      // part of stage 2
      q4s16 = vaddq_s16(q0s16, q1s16);
      q13s16 = vsubq_s16(q0s16, q1s16);
      q6s16 = vaddq_s16(q2s16, q3s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)

      // generate 18,19,28,29
      // part of stage 1
      LOAD_FROM_TRANSPOSED(15, 9, 23)
      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(23, 25, 7)
      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q3s16, q2s16);
      q3s16 = vaddq_s16(q3s16, q2s16);
      q14s16 = vsubq_s16(q1s16, q0s16);
      q2s16 = vaddq_s16(q1s16, q0s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
      // part of stage 4
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q10s16 = vaddq_s16(q7s16, q1s16);
      q15s16 = vaddq_s16(q6s16, q3s16);
      q13s16 = vsubq_s16(q5s16, q0s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
      // part of stage 4
      q13s16 = vsubq_s16(q4s16, q2s16);
      q14s16 = vsubq_s16(q6s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)

      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      LOAD_FROM_TRANSPOSED(7, 5, 27)
      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(27, 21, 11)
      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)

      // generate 22,23,24,25
      // part of stage 1
      LOAD_FROM_TRANSPOSED(11, 13, 19)
      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(19, 29, 3)
      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
      // part of stage 2
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
      // part of stage 4
      q10s16 = vaddq_s16(q7s16, q1s16);
      q11s16 = vaddq_s16(q5s16, q0s16);
      q12s16 = vaddq_s16(q6s16, q2s16);
      q15s16 = vaddq_s16(q4s16, q3s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q11s16);
      q9s16 = vaddq_s16(q13s16, q10s16);
      q13s16 = vsubq_s16(q13s16, q10s16);
      q11s16 = vsubq_s16(q14s16, q11s16);
      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
      q8s16 = vsubq_s16(q9s16, q12s16);
      q10s16 = vaddq_s16(q14s16, q15s16);
      q14s16 = vsubq_s16(q14s16, q15s16);
      q12s16 = vaddq_s16(q9s16, q12s16);
      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
      q13s16 = q11s16;
      q14s16 = q8s16;
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
      // part of stage 4
      q14s16 = vsubq_s16(q5s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q2s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      q13s16 = vsubq_s16(q4s16, q3s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q1s16);
      q9s16 = vaddq_s16(q13s16, q6s16);
      q13s16 = vsubq_s16(q13s16, q6s16);
      q1s16 = vsubq_s16(q14s16, q1s16);
      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
      q14s16 = vsubq_s16(q8s16, q5s16);
      q10s16 = vaddq_s16(q8s16, q5s16);
      q11s16 = vaddq_s16(q9s16, q0s16);
      q0s16 = vsubq_s16(q9s16, q0s16);
      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
                   &q1s16, &q0s16);
      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)

      // -----------------------------------------
      // BLOCK C: 8-10,11-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      LOAD_FROM_TRANSPOSED(3, 2, 30)
      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(30, 18, 14)
      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
      // part of stage 3
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 4
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)

      // generate 10,11,12,13
      // part of stage 2
      LOAD_FROM_TRANSPOSED(14, 10, 22)
      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(22, 26, 6)
      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
      // part of stage 3
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 4
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
      // part of stage 5
      q8s16 = vaddq_s16(q0s16, q5s16);
      q9s16 = vaddq_s16(q1s16, q7s16);
      q13s16 = vsubq_s16(q1s16, q7s16);
      q14s16 = vsubq_s16(q3s16, q4s16);
      q10s16 = vaddq_s16(q3s16, q4s16);
      q15s16 = vaddq_s16(q2s16, q6s16);
      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
      // part of stage 6
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
      q13s16 = vsubq_s16(q0s16, q5s16);
      q14s16 = vsubq_s16(q2s16, q6s16);
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)

      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      LOAD_FROM_TRANSPOSED(6, 4, 28)
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(28, 20, 12)
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
      // part of stage 4
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)

      // generate 0,1,2,3
      // part of stage 4
      LOAD_FROM_TRANSPOSED(12, 0, 16)
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(16, 8, 24)
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
      // part of stage 5
      q4s16 = vaddq_s16(q7s16, q6s16);
      q7s16 = vsubq_s16(q7s16, q6s16);
      q6s16 = vsubq_s16(q5s16, q14s16);
      q5s16 = vaddq_s16(q5s16, q14s16);
      // part of stage 6
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q3s16);
      q10s16 = vaddq_s16(q6s16, q1s16);
      q11s16 = vaddq_s16(q7s16, q0s16);
      q12s16 = vsubq_s16(q7s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q1s16);
      q14s16 = vsubq_s16(q5s16, q3s16);
      q15s16 = vsubq_s16(q4s16, q2s16);
      // part of stage 7
      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
      q2s16 = vaddq_s16(q8s16, q1s16);
      q3s16 = vaddq_s16(q9s16, q0s16);
      q4s16 = vsubq_s16(q9s16, q0s16);
      q5s16 = vsubq_s16(q8s16, q1s16);
      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
      q8s16 = vaddq_s16(q4s16, q1s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q6s16 = vsubq_s16(q5s16, q0s16);
      q7s16 = vsubq_s16(q4s16, q1s16);

      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16,
                                  q15s16);
      } else {
        idct32_bands_end_2nd_pass(out, dest, stride,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16,
                                  q15s16);
        dest += 8;
      }
    }
  }
  return;
}
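
/* A minimal usage sketch (hypothetical caller, not part of this file):
 * |coeffs| must hold all 1024 dequantized coefficients of a 32x32 block in
 * raster order, and |dst|/|stride| describe the prediction block that the
 * residual is added to:
 *
 *   vpx_idct32x32_1024_add_neon(coeffs, dst, stride);
 */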