1233d2500723e5594f3e7c70896ffeeef32b9c950ywan; 2233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3233d2500723e5594f3e7c70896ffeeef32b9c950ywan; 4233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Use of this source code is governed by a BSD-style license 5233d2500723e5594f3e7c70896ffeeef32b9c950ywan; that can be found in the LICENSE file in the root of the source 6233d2500723e5594f3e7c70896ffeeef32b9c950ywan; tree. An additional intellectual property rights grant can be found 7233d2500723e5594f3e7c70896ffeeef32b9c950ywan; in the file PATENTS. All contributing project authors may 8233d2500723e5594f3e7c70896ffeeef32b9c950ywan; be found in the AUTHORS file in the root of the source tree. 9233d2500723e5594f3e7c70896ffeeef32b9c950ywan; 10233d2500723e5594f3e7c70896ffeeef32b9c950ywan 11233d2500723e5594f3e7c70896ffeeef32b9c950ywan EXPORT |vp9_iht8x8_64_add_neon| 12233d2500723e5594f3e7c70896ffeeef32b9c950ywan ARM 13233d2500723e5594f3e7c70896ffeeef32b9c950ywan REQUIRE8 14233d2500723e5594f3e7c70896ffeeef32b9c950ywan PRESERVE8 15233d2500723e5594f3e7c70896ffeeef32b9c950ywan 16233d2500723e5594f3e7c70896ffeeef32b9c950ywan AREA ||.text||, CODE, READONLY, ALIGN=2 17233d2500723e5594f3e7c70896ffeeef32b9c950ywan 18233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; Generate IADST constants in r0 - r12 for the IADST. 
; GENERATE_IADST_CONSTANTS writes r0-r10 and r12; r11 is not written by
; this macro. Every 14-bit cosine constant is assembled from its upper
; byte (mov) plus its lower byte (add).
    MACRO
    GENERATE_IADST_CONSTANTS
    ; r0 = cospi_2_64 = 16305 (0x3fb1)
    mov             r0, #0x3f00
    add             r0, r0, #0xb1

    ; r1 = cospi_30_64 = 1606 (0x0646)
    mov             r1, #0x600
    add             r1, r1, #0x46

    ; r2 = cospi_10_64 = 14449 (0x3871)
    mov             r2, #0x3800
    add             r2, r2, #0x71

    ; r3 = cospi_22_64 = 7723 (0x1e2b)
    mov             r3, #0x1e00
    add             r3, r3, #0x2b

    ; r4 = cospi_18_64 = 10394 (0x289a)
    mov             r4, #0x2800
    add             r4, r4, #0x9a

    ; r5 = cospi_14_64 = 12665 (0x3179)
    mov             r5, #0x3100
    add             r5, r5, #0x79

    ; r6 = cospi_26_64 = 4756 (0x1294)
    mov             r6, #0x1200
    add             r6, r6, #0x94

    ; r7 = cospi_6_64 = 15679 (0x3d3f)
    mov             r7, #0x3d00
    add             r7, r7, #0x3f

    ; r8 = cospi_8_64 = 15137 (0x3b21)
    mov             r8, #0x3b00
    add             r8, r8, #0x21

    ; r9 = cospi_24_64 = 6270 (0x187e)
    mov             r9, #0x1800
    add             r9, r9, #0x7e

    ; r10 = 0
    mov             r10, #0

    ; r12 = cospi_16_64 = 11585 (0x2d41)
    mov             r12, #0x2d00
    add             r12, r12, #0x41
    MEND

    ; Generate IDCT constants in r3 - r9 for the IDCT.
; GENERATE_IDCT_CONSTANTS writes r3-r9 only. Note that r8/r9 receive
; cospi_24_64/cospi_8_64 here, the opposite assignment to the one made by
; GENERATE_IADST_CONSTANTS. Each value is assembled from its upper byte
; (mov) plus its lower byte (add).
    MACRO
    GENERATE_IDCT_CONSTANTS
    ; r3 = cospi_28_64 = 3196 (0x0c7c)
    mov             r3, #0x0c00
    add             r3, r3, #0x7c

    ; r4 = cospi_4_64 = 16069 (0x3ec5)
    mov             r4, #0x3e00
    add             r4, r4, #0xc5

    ; r5 = cospi_12_64 = 13623 (0x3537)
    mov             r5, #0x3500
    add             r5, r5, #0x37

    ; r6 = cospi_20_64 = 9102 (0x238e)
    mov             r6, #0x2300
    add             r6, r6, #0x8e

    ; r7 = cospi_16_64 = 11585 (0x2d41)
    mov             r7, #0x2d00
    add             r7, r7, #0x41

    ; r8 = cospi_24_64 = 6270 (0x187e)
    mov             r8, #0x1800
    add             r8, r8, #0x7e

    ; r9 = cospi_8_64 = 15137 (0x3b21)
    mov             r9, #0x3b00
    add             r9, r9, #0x21
    MEND

    ; Transpose a 8x8 16bits data
; matrix. Data are loaded in q8-q15.
    MACRO
    TRANSPOSE8X8
    ; Step 1: exchange the off-diagonal 4x4 quadrants. The upper d-half of
    ; each of q8..q11 trades places with the lower d-half of q12..q15.
    ; The four swaps touch disjoint registers.
    vswp            d17, d24
    vswp            d19, d26
    vswp            d21, d28
    vswp            d23, d30

    ; Step 2: transpose 32-bit element pairs between rows two apart.
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q13, q15

    ; Step 3: transpose 16-bit elements between adjacent rows.
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q14, q15
    MEND

    ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
    ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
    ; will be stored back into q8-q15 registers. This macro will touch q0-q7
    ; registers and use them as buffer during calculation.
; IDCT8x8_1D: one 8-point inverse DCT pass over eight 16-bit lanes at once.
;   in : q8-q15 hold the eight input rows; r3-r9 hold cospi_28, cospi_4,
;        cospi_12, cospi_20, cospi_16, cospi_24, cospi_8 (in that order).
;   out: q8-q15 hold output[0..7]; q0-q7 are clobbered as scratch.
; Every product is widened to 32 bits (vmull/vmlal/vmlsl) and brought back
; to 16 bits with vqrshrn #14, i.e. a rounding, saturating >> 14.
; The instruction interleaving reuses registers as soon as their previous
; contents are dead, so the order must not be changed casually.
    MACRO
    IDCT8x8_1D
    ; stage 1 (odd inputs: q9 = input[1], q11 = input[3], q13 = input[5],
    ; q15 = input[7])
    vdup.16         d0, r3                    ; d0 = cospi_28_64 in all lanes
    vdup.16         d1, r4                    ; d1 = cospi_4_64 in all lanes
    vdup.16         d2, r5                    ; d2 = cospi_12_64 in all lanes
    vdup.16         d3, r6                    ; d3 = cospi_20_64 in all lanes

    ; q2:q3 = input[1] * cospi_28_64
    vmull.s16       q2, d18, d0
    vmull.s16       q3, d19, d0

    ; q5:q6 = input[5] * cospi_12_64
    vmull.s16       q5, d26, d2
    vmull.s16       q6, d27, d2

    ; q2:q3 = input[1] * cospi_28_64 - input[7] * cospi_4_64
    vmlsl.s16       q2, d30, d1
    vmlsl.s16       q3, d31, d1

    ; q5:q6 = input[5] * cospi_12_64 - input[3] * cospi_20_64
    vmlsl.s16       q5, d22, d3
    vmlsl.s16       q6, d23, d3

    ; q4 = step1[4] = round_shift(input[1]*cospi_28_64 - input[7]*cospi_4_64)
    vqrshrn.s32     d8, q2, #14               ; low four lanes
    vqrshrn.s32     d9, q3, #14               ; high four lanes

    ; q5 = step1[5] = round_shift(input[5]*cospi_12_64 - input[3]*cospi_20_64)
    vqrshrn.s32     d10, q5, #14              ; low four lanes
    vqrshrn.s32     d11, q6, #14              ; high four lanes

    ; q2:q3 = input[1] * cospi_4_64
    vmull.s16       q2, d18, d1
    vmull.s16       q3, d19, d1

    ; q9:q13 = input[5] * cospi_20_64
    ; (q9 no longer needs input[1]; q13 is overwritten from its own high half)
    vmull.s16       q9, d26, d3
    vmull.s16       q13, d27, d3

    ; q2:q3 = input[1] * cospi_4_64 + input[7] * cospi_28_64
    vmlal.s16       q2, d30, d0
    vmlal.s16       q3, d31, d0

    ; q9:q13 = input[5] * cospi_20_64 + input[3] * cospi_12_64
    vmlal.s16       q9, d22, d2
    vmlal.s16       q13, d23, d2

    ; q7 = step1[7] = round_shift(input[1]*cospi_4_64 + input[7]*cospi_28_64)
    vqrshrn.s32     d14, q2, #14              ; low four lanes
    vqrshrn.s32     d15, q3, #14              ; high four lanes

    ; stage 2 & stage 3 - even half (a 4-point IDCT on q8, q10, q12, q14;
    ; the input[] indices in the comments below are the 4-point indices:
    ; input[0] = q8, input[1] = q10, input[2] = q12, input[3] = q14)
    vdup.16         d0, r7                    ; d0 = cospi_16_64 in all lanes

    ; q6 = step1[6] = round_shift(input[5]*cospi_20_64 + input[3]*cospi_12_64)
    ; (finishes the stage 1 odd half; operands refer to the 8-point inputs)
    vqrshrn.s32     d12, q9, #14              ; low four lanes
    vqrshrn.s32     d13, q13, #14             ; high four lanes

    ; q2:q3 = input[0] * cospi_16_64
    vmull.s16       q2, d16, d0
    vmull.s16       q3, d17, d0

    ; q13:q15 = input[0] * cospi_16_64 (second copy for the difference term)
    vmull.s16       q13, d16, d0
    vmull.s16       q15, d17, d0

    ; q2:q3 = (input[0] + input[2]) * cospi_16_64
    vmlal.s16       q2, d24, d0
    vmlal.s16       q3, d25, d0

    ; q13:q15 = (input[0] - input[2]) * cospi_16_64
    vmlsl.s16       q13, d24, d0
    vmlsl.s16       q15, d25, d0

    vdup.16         d0, r8                    ; d0 = cospi_24_64 in all lanes
    vdup.16         d1, r9                    ; d1 = cospi_8_64 in all lanes

    ; q9 = step[0] = round_shift((input[0] + input[2]) * cospi_16_64)
    vqrshrn.s32     d18, q2, #14              ; low four lanes
    vqrshrn.s32     d19, q3, #14              ; high four lanes

    ; q11 = step[1] = round_shift((input[0] - input[2]) * cospi_16_64)
    vqrshrn.s32     d22, q13, #14             ; low four lanes
    vqrshrn.s32     d23, q15, #14             ; high four lanes

    ; q2:q3 = input[1] * cospi_24_64
    vmull.s16       q2, d20, d0
    vmull.s16       q3, d21, d0

    ; q8:q12 = input[1] * cospi_8_64
    vmull.s16       q8, d20, d1
    vmull.s16       q12, d21, d1

    ; q2:q3 = input[1] * cospi_24_64 - input[3] * cospi_8_64
    vmlsl.s16       q2, d28, d1
    vmlsl.s16       q3, d29, d1

    ; q8:q12 = input[1] * cospi_8_64 + input[3] * cospi_24_64
    vmlal.s16       q8, d28, d0
    vmlal.s16       q12, d29, d0

    ; q13 = step[2] = round_shift(input[1]*cospi_24_64 - input[3]*cospi_8_64)
    vqrshrn.s32     d26, q2, #14              ; low four lanes
    vqrshrn.s32     d27, q3, #14              ; high four lanes

    ; q15 = step[3] = round_shift(input[1]*cospi_8_64 + input[3]*cospi_24_64)
    vqrshrn.s32     d30, q8, #14              ; low four lanes
    vqrshrn.s32     d31, q12, #14             ; high four lanes

    ; even-half butterflies; results become step1[0..3] for stage 4
    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]

    ; stage 3 - odd half (constant loaded early; q8 is free at this point)
    vdup.16         d16, r7                   ; d16 = cospi_16_64 in all lanes

    ; stage 2 - odd half
    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]

    ; q9:q10 = step2[6] * cospi_16_64
    vmull.s16       q9, d28, d16
    vmull.s16       q10, d29, d16

    ; q11:q12 = step2[6] * cospi_16_64 (second copy for the sum term)
    vmull.s16       q11, d28, d16
    vmull.s16       q12, d29, d16

    ; q9:q10 = (step2[6] - step2[5]) * cospi_16_64
    vmlsl.s16       q9, d26, d16
    vmlsl.s16       q10, d27, d16

    ; q11:q12 = (step2[5] + step2[6]) * cospi_16_64
    vmlal.s16       q11, d26, d16
    vmlal.s16       q12, d27, d16

    ; q5 = step1[5] = round_shift((step2[6] - step2[5]) * cospi_16_64)
    vqrshrn.s32     d10, q9, #14              ; low four lanes
    vqrshrn.s32     d11, q10, #14             ; high four lanes

    ; q6 = step1[6] = round_shift((step2[5] + step2[6]) * cospi_16_64)
    vqrshrn.s32     d12, q11, #14             ; low four lanes
    vqrshrn.s32     d13, q12, #14             ; high four lanes

    ; stage 4 (q0-q3 = step1[0..3], q4-q7 = step1[4..7])
    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
    MEND

    ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
    ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers.
The 283233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; output will be stored back into q8-q15 registers. This macro will touch 284233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; q0 - q7 registers and use them as buffer during calculation. 285233d2500723e5594f3e7c70896ffeeef32b9c950ywan MACRO 286233d2500723e5594f3e7c70896ffeeef32b9c950ywan IADST8X8_1D 287233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d14, r0 ; duplicate cospi_2_64 288233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d15, r1 ; duplicate cospi_30_64 289233d2500723e5594f3e7c70896ffeeef32b9c950ywan 290233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_2_64 * x0 291233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q1, d30, d14 292233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q2, d31, d14 293233d2500723e5594f3e7c70896ffeeef32b9c950ywan 294233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_30_64 * x0 295233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q3, d30, d15 296233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q4, d31, d15 297233d2500723e5594f3e7c70896ffeeef32b9c950ywan 298233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d30, r4 ; duplicate cospi_18_64 299233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d31, r5 ; duplicate cospi_14_64 300233d2500723e5594f3e7c70896ffeeef32b9c950ywan 301233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 302233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q1, d16, d15 303233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q2, d17, d15 304233d2500723e5594f3e7c70896ffeeef32b9c950ywan 305233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1 306233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q3, d16, d14 307233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q4, d17, d14 308233d2500723e5594f3e7c70896ffeeef32b9c950ywan 309233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_18_64 * x4 310233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 
q5, d22, d30 311233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q6, d23, d30 312233d2500723e5594f3e7c70896ffeeef32b9c950ywan 313233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_14_64 * x4 314233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q7, d22, d31 315233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q8, d23, d31 316233d2500723e5594f3e7c70896ffeeef32b9c950ywan 317233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 318233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q5, d24, d31 319233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q6, d25, d31 320233d2500723e5594f3e7c70896ffeeef32b9c950ywan 321233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5 322233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q7, d24, d30 323233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q8, d25, d30 324233d2500723e5594f3e7c70896ffeeef32b9c950ywan 325233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s0 + s4) 326233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q11, q1, q5 327233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q12, q2, q6 328233d2500723e5594f3e7c70896ffeeef32b9c950ywan 329233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d0, r2 ; duplicate cospi_10_64 330233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d1, r3 ; duplicate cospi_22_64 331233d2500723e5594f3e7c70896ffeeef32b9c950ywan 332233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s0 - s4) 333233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q1, q1, q5 334233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q2, q2, q6 335233d2500723e5594f3e7c70896ffeeef32b9c950ywan 336233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x0 = dct_const_round_shift(s0 + s4); 337233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d22, q11, #14 ; >> 14 338233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d23, q12, #14 ; >> 14 339233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s1 + s5) 341233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q12, q3, q7 342233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q15, q4, q8 343233d2500723e5594f3e7c70896ffeeef32b9c950ywan 344233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s1 - s5) 345233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q3, q3, q7 346233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q4, q4, q8 347233d2500723e5594f3e7c70896ffeeef32b9c950ywan 348233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x4 = dct_const_round_shift(s0 - s4); 349233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d2, q1, #14 ; >> 14 350233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d3, q2, #14 ; >> 14 351233d2500723e5594f3e7c70896ffeeef32b9c950ywan 352233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x1 = dct_const_round_shift(s1 + s5); 353233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d24, q12, #14 ; >> 14 354233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d25, q15, #14 ; >> 14 355233d2500723e5594f3e7c70896ffeeef32b9c950ywan 356233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x5 = dct_const_round_shift(s1 - s5); 357233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d6, q3, #14 ; >> 14 358233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d7, q4, #14 ; >> 14 359233d2500723e5594f3e7c70896ffeeef32b9c950ywan 360233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_10_64 * x2 361233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q4, d26, d0 362233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q5, d27, d0 363233d2500723e5594f3e7c70896ffeeef32b9c950ywan 364233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_22_64 * x2 365233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q2, d26, d1 366233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q6, d27, d1 367233d2500723e5594f3e7c70896ffeeef32b9c950ywan 368233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d30, r6 ; duplicate cospi_26_64 
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d31, r7 ; duplicate cospi_6_64 370233d2500723e5594f3e7c70896ffeeef32b9c950ywan 371233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 372233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q4, d20, d1 373233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q5, d21, d1 374233d2500723e5594f3e7c70896ffeeef32b9c950ywan 375233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 376233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q2, d20, d0 377233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q6, d21, d0 378233d2500723e5594f3e7c70896ffeeef32b9c950ywan 379233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_26_64 * x6 380233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q0, d18, d30 381233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q13, d19, d30 382233d2500723e5594f3e7c70896ffeeef32b9c950ywan 383233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 384233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q0, d28, d31 385233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q13, d29, d31 386233d2500723e5594f3e7c70896ffeeef32b9c950ywan 387233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_6_64 * x6 388233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q10, d18, d31 389233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q9, d19, d31 390233d2500723e5594f3e7c70896ffeeef32b9c950ywan 391233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 392233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q10, d28, d30 393233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q9, d29, d30 394233d2500723e5594f3e7c70896ffeeef32b9c950ywan 395233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s3 + s7) 396233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q14, q2, q10 397233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q15, q6, q9 
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan 399233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s3 - s7) 400233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q2, q2, q10 401233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q6, q6, q9 402233d2500723e5594f3e7c70896ffeeef32b9c950ywan 403233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x3 = dct_const_round_shift(s3 + s7); 404233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d28, q14, #14 ; >> 14 405233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d29, q15, #14 ; >> 14 406233d2500723e5594f3e7c70896ffeeef32b9c950ywan 407233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x7 = dct_const_round_shift(s3 - s7); 408233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d4, q2, #14 ; >> 14 409233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d5, q6, #14 ; >> 14 410233d2500723e5594f3e7c70896ffeeef32b9c950ywan 411233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s2 + s6) 412233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q9, q4, q0 413233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q10, q5, q13 414233d2500723e5594f3e7c70896ffeeef32b9c950ywan 415233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s2 - s6) 416233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q4, q4, q0 417233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q5, q5, q13 418233d2500723e5594f3e7c70896ffeeef32b9c950ywan 419233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d30, r8 ; duplicate cospi_8_64 420233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d31, r9 ; duplicate cospi_24_64 421233d2500723e5594f3e7c70896ffeeef32b9c950ywan 422233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x2 = dct_const_round_shift(s2 + s6); 423233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d18, q9, #14 ; >> 14 424233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d19, q10, #14 ; >> 14 425233d2500723e5594f3e7c70896ffeeef32b9c950ywan 426233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x6 = dct_const_round_shift(s2 - s6); 
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d8, q4, #14 ; >> 14 428233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d9, q5, #14 ; >> 14 429233d2500723e5594f3e7c70896ffeeef32b9c950ywan 430233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_8_64 * x4 431233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q5, d2, d30 432233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q6, d3, d30 433233d2500723e5594f3e7c70896ffeeef32b9c950ywan 434233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_24_64 * x4 435233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q7, d2, d31 436233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q0, d3, d31 437233d2500723e5594f3e7c70896ffeeef32b9c950ywan 438233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 439233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q5, d6, d31 440233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q6, d7, d31 441233d2500723e5594f3e7c70896ffeeef32b9c950ywan 442233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 443233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q7, d6, d30 444233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q0, d7, d30 445233d2500723e5594f3e7c70896ffeeef32b9c950ywan 446233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_8_64 * x7 447233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q1, d4, d30 448233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q3, d5, d30 449233d2500723e5594f3e7c70896ffeeef32b9c950ywan 450233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_24_64 * x7 451233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q10, d4, d31 452233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q2, d5, d31 453233d2500723e5594f3e7c70896ffeeef32b9c950ywan 454233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 455233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q1, d8, d31 456233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
vmlsl.s16 q3, d9, d31 457233d2500723e5594f3e7c70896ffeeef32b9c950ywan 458233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 459233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q10, d8, d30 460233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q2, d9, d30 461233d2500723e5594f3e7c70896ffeeef32b9c950ywan 462233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s16 q8, q11, q9 ; x0 = s0 + s2; 463233d2500723e5594f3e7c70896ffeeef32b9c950ywan 464233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s16 q11, q11, q9 ; x2 = s0 - s2; 465233d2500723e5594f3e7c70896ffeeef32b9c950ywan 466233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s16 q4, q12, q14 ; x1 = s1 + s3; 467233d2500723e5594f3e7c70896ffeeef32b9c950ywan 468233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s16 q12, q12, q14 ; x3 = s1 - s3; 469233d2500723e5594f3e7c70896ffeeef32b9c950ywan 470233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s4 + s6) 471233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q14, q5, q1 472233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q15, q6, q3 473233d2500723e5594f3e7c70896ffeeef32b9c950ywan 474233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s4 - s6) 475233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q5, q5, q1 476233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q6, q6, q3 477233d2500723e5594f3e7c70896ffeeef32b9c950ywan 478233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x4 = dct_const_round_shift(s4 + s6); 479233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d18, q14, #14 ; >> 14 480233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d19, q15, #14 ; >> 14 481233d2500723e5594f3e7c70896ffeeef32b9c950ywan 482233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x6 = dct_const_round_shift(s4 - s6); 483233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d10, q5, #14 ; >> 14 484233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d11, q6, #14 ; >> 14 485233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s5 + s7) 487233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q1, q7, q10 488233d2500723e5594f3e7c70896ffeeef32b9c950ywan vadd.s32 q3, q0, q2 489233d2500723e5594f3e7c70896ffeeef32b9c950ywan 490233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; (s5 - s7)) 491233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q7, q7, q10 492233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s32 q0, q0, q2 493233d2500723e5594f3e7c70896ffeeef32b9c950ywan 494233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x5 = dct_const_round_shift(s5 + s7); 495233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d28, q1, #14 ; >> 14 496233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d29, q3, #14 ; >> 14 497233d2500723e5594f3e7c70896ffeeef32b9c950ywan 498233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x7 = dct_const_round_shift(s5 - s7); 499233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d14, q7, #14 ; >> 14 500233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d15, q0, #14 ; >> 14 501233d2500723e5594f3e7c70896ffeeef32b9c950ywan 502233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 d30, r12 ; duplicate cospi_16_64 503233d2500723e5594f3e7c70896ffeeef32b9c950ywan 504233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_16_64 * x2 505233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q2, d22, d30 506233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q3, d23, d30 507233d2500723e5594f3e7c70896ffeeef32b9c950ywan 508233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_6_64 * x6 509233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q13, d22, d30 510233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q1, d23, d30 511233d2500723e5594f3e7c70896ffeeef32b9c950ywan 512233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_16_64 * x2 + cospi_16_64 * x3; 513233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q2, d24, d30 514233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q3, d25, d30 
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan 516233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_16_64 * x2 - cospi_16_64 * x3; 517233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q13, d24, d30 518233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q1, d25, d30 519233d2500723e5594f3e7c70896ffeeef32b9c950ywan 520233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x2 = dct_const_round_shift(s2); 521233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d4, q2, #14 ; >> 14 522233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d5, q3, #14 ; >> 14 523233d2500723e5594f3e7c70896ffeeef32b9c950ywan 524233d2500723e5594f3e7c70896ffeeef32b9c950ywan ;x3 = dct_const_round_shift(s3); 525233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d24, q13, #14 ; >> 14 526233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d25, q1, #14 ; >> 14 527233d2500723e5594f3e7c70896ffeeef32b9c950ywan 528233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_16_64 * x6 529233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q13, d10, d30 530233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q1, d11, d30 531233d2500723e5594f3e7c70896ffeeef32b9c950ywan 532233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_6_64 * x6 533233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q11, d10, d30 534233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmull.s16 q0, d11, d30 535233d2500723e5594f3e7c70896ffeeef32b9c950ywan 536233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_16_64 * x6 + cospi_16_64 * x7; 537233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q13, d14, d30 538233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlal.s16 q1, d15, d30 539233d2500723e5594f3e7c70896ffeeef32b9c950ywan 540233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; cospi_16_64 * x6 - cospi_16_64 * x7; 541233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q11, d14, d30 542233d2500723e5594f3e7c70896ffeeef32b9c950ywan vmlsl.s16 q0, d15, d30 543233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; x6 = dct_const_round_shift(s6); 545233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d20, q13, #14 ; >> 14 546233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d21, q1, #14 ; >> 14 547233d2500723e5594f3e7c70896ffeeef32b9c950ywan 548233d2500723e5594f3e7c70896ffeeef32b9c950ywan ;x7 = dct_const_round_shift(s7); 549233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d12, q11, #14 ; >> 14 550233d2500723e5594f3e7c70896ffeeef32b9c950ywan vqrshrn.s32 d13, q0, #14 ; >> 14 551233d2500723e5594f3e7c70896ffeeef32b9c950ywan 552233d2500723e5594f3e7c70896ffeeef32b9c950ywan vdup.16 q5, r10 ; duplicate 0 553233d2500723e5594f3e7c70896ffeeef32b9c950ywan 554233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s16 q9, q5, q9 ; output[1] = -x4; 555233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s16 q11, q5, q2 ; output[3] = -x2; 556233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s16 q13, q5, q6 ; output[5] = -x7; 557233d2500723e5594f3e7c70896ffeeef32b9c950ywan vsub.s16 q15, q5, q4 ; output[7] = -x1; 558233d2500723e5594f3e7c70896ffeeef32b9c950ywan MEND 559233d2500723e5594f3e7c70896ffeeef32b9c950ywan 560233d2500723e5594f3e7c70896ffeeef32b9c950ywan 561233d2500723e5594f3e7c70896ffeeef32b9c950ywan AREA Block, CODE, READONLY ; name this block of code 562233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest, 563233d2500723e5594f3e7c70896ffeeef32b9c950ywan; int dest_stride, int tx_type) 564233d2500723e5594f3e7c70896ffeeef32b9c950ywan; 565233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0 int16_t input 566233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1 uint8_t *dest 567233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2 int dest_stride 568233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3 int tx_type) 569233d2500723e5594f3e7c70896ffeeef32b9c950ywan; This function will only handle tx_type of 1,2,3. 
|vp9_iht8x8_64_add_neon| PROC
    ; Inverse hybrid 8x8 transform; the rounded result is added to the
    ; 8x8 block of pixels at dest and written back with saturation.
    ;   r0 = input  (64 x int16 coefficients)
    ;   r1 = dest   (8-bit pixels)
    ;   r2 = dest_stride (byte step between dest rows)
    ;   r3 = tx_type (2 -> idct_iadst, 3 -> iadst_iadst,
    ;                 any other value falls through to iadst_idct)

    ; Pull in all 64 coefficients. The four loads fill q8-q15 (d16-d31);
    ; r0 is advanced by each load.
    vld1.s16        {q8,q9}, [r0]!
    vld1.s16        {q10,q11}, [r0]!
    vld1.s16        {q12,q13}, [r0]!
    vld1.s16        {q14,q15}, [r0]!

    ; The constant-generation macros overwrite core registers (dest and
    ; dest_stride in r1/r2 are needed again afterwards) and the transform
    ; macros use q4-q7, so both register sets are parked on the stack.
    push            {r0-r10}
    vpush           {d8-d15}

    ; transpose the coefficient block before the first pass
    TRANSPOSE8X8

    ; pick the row/column transform pair from tx_type
    cmp             r3, #2
    beq             idct_iadst
    cmp             r3, #3
    beq             iadst_iadst

iadst_idct
    ; first pass: IDCT
    GENERATE_IDCT_CONSTANTS
    IDCT8x8_1D

    ; flip rows and columns between the passes
    TRANSPOSE8X8

    ; second pass: IADST
    GENERATE_IADST_CONSTANTS
    IADST8X8_1D

    b               end_vp9_iht8x8_64_add_neon

idct_iadst
    ; first pass: IADST
    GENERATE_IADST_CONSTANTS
    IADST8X8_1D

    ; flip rows and columns between the passes
    TRANSPOSE8X8

    ; second pass: IDCT
    GENERATE_IDCT_CONSTANTS
    IDCT8x8_1D

    b               end_vp9_iht8x8_64_add_neon

iadst_iadst
    ; IADST in both passes; the constants are generated once and reused
    GENERATE_IADST_CONSTANTS
    IADST8X8_1D

    ; flip rows and columns between the passes
    TRANSPOSE8X8

    IADST8X8_1D

end_vp9_iht8x8_64_add_neon
    ; bring back the saved registers (r1 = dest, r2 = dest_stride again)
    vpop            {d8-d15}
    pop             {r0-r10}

    ; r1 walks the destination for the loads; r0 keeps the start of the
    ; block for the stores further down
    mov             r0, r1

    ; For each of the eight rows:
    ;   ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
    ; i.e. rounding shift right by 5, fetch 8 destination pixels, then a
    ; widening add of those pixels onto the 16-bit lanes.

    ; row 0
    vrshr.s16       q8, q8, #5
    vld1.64         {d0}, [r1], r2
    vaddw.u8        q8, q8, d0

    ; row 1
    vrshr.s16       q9, q9, #5
    vld1.64         {d1}, [r1], r2
    vaddw.u8        q9, q9, d1

    ; row 2
    vrshr.s16       q10, q10, #5
    vld1.64         {d2}, [r1], r2
    vaddw.u8        q10, q10, d2

    ; row 3
    vrshr.s16       q11, q11, #5
    vld1.64         {d3}, [r1], r2
    vaddw.u8        q11, q11, d3

    ; row 4
    vrshr.s16       q12, q12, #5
    vld1.64         {d4}, [r1], r2
    vaddw.u8        q12, q12, d4

    ; row 5
    vrshr.s16       q13, q13, #5
    vld1.64         {d5}, [r1], r2
    vaddw.u8        q13, q13, d5

    ; row 6
    vrshr.s16       q14, q14, #5
    vld1.64         {d6}, [r1], r2
    vaddw.u8        q14, q14, d6

    ; row 7 (last load, so r1 is not advanced)
    vrshr.s16       q15, q15, #5
    vld1.64         {d7}, [r1]
    vaddw.u8        q15, q15, d7

    ; clip_pixel: saturating narrow of each row to unsigned 8 bit, then
    ; write it back. Every destination load above has already completed
    ; before the first store is issued.
    vqmovun.s16     d0, q8
    vst1.64         {d0}, [r0], r2
    vqmovun.s16     d1, q9
    vst1.64         {d1}, [r0], r2
    vqmovun.s16     d2, q10
    vst1.64         {d2}, [r0], r2
    vqmovun.s16     d3, q11
    vst1.64         {d3}, [r0], r2
    vqmovun.s16     d4, q12
    vst1.64         {d4}, [r0], r2
    vqmovun.s16     d5, q13
    vst1.64         {d5}, [r0], r2
    vqmovun.s16     d6, q14
    vst1.64         {d6}, [r0], r2
    vqmovun.s16     d7, q15
    vst1.64         {d7}, [r0], r2

    bx              lr
    ENDP  ; |vp9_iht8x8_64_add_neon|

    END