1233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Use of this source code is governed by a BSD-style license
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  that can be found in the LICENSE file in the root of the source
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  tree. An additional intellectual property rights grant can be found
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  in the file PATENTS.  All contributing project authors may
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  be found in the AUTHORS file in the root of the source tree.
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_iht8x8_64_add_neon|
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ARM
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan    REQUIRE8
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan    PRESERVE8
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan    AREA ||.text||, CODE, READONLY, ALIGN=2
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; GENERATE_IADST_CONSTANTS: load the IADST cosine constants into r0 - r10
    ; and r12. r11 is not written by this macro.
    ; Each constant is built in two steps: "mov rN, #<high byte> << 8" followed
    ; by "add rN, #<low byte>". The combined hex value is listed next to the
    ; decimal value on each entry below.
    ; Clobbers: r0 - r10, r12.
    ; Note: r3 - r9 are also the registers written by GENERATE_IDCT_CONSTANTS
    ; (with different values), so expanding one macro after the other replaces
    ; the previous contents of r3 - r9.
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MACRO
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GENERATE_IADST_CONSTANTS
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r0 = cospi_2_64 = 16305 (0x3fb1)
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r0, #0x3f00
23233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r0, #0xb1
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r1 = cospi_30_64 = 1606 (0x0646)
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r1, #0x600
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r1, #0x46
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r2 = cospi_10_64 = 14449 (0x3871)
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r2, #0x3800
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r2, #0x71
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r3 = cospi_22_64 = 7723 (0x1e2b)
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r3, #0x1e00
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r3, #0x2b
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r4 = cospi_18_64 = 10394 (0x289a)
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r4, #0x2800
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r4, #0x9a
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r5 = cospi_14_64 = 12665 (0x3179)
42233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r5, #0x3100
43233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r5, #0x79
44233d2500723e5594f3e7c70896ffeeef32b9c950ywan
45233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r6 = cospi_26_64 = 4756 (0x1294)
46233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r6, #0x1200
47233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r6, #0x94
48233d2500723e5594f3e7c70896ffeeef32b9c950ywan
49233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r7 = cospi_6_64 = 15679 (0x3d3f)
50233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r7, #0x3d00
51233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r7, #0x3f
52233d2500723e5594f3e7c70896ffeeef32b9c950ywan
53233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r8 = cospi_8_64 = 15137 (0x3b21)
54233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r8, #0x3b00
55233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r8, #0x21
56233d2500723e5594f3e7c70896ffeeef32b9c950ywan
57233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r9 = cospi_24_64 = 6270 (0x187e)
58233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r9, #0x1800
59233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r9, #0x7e
60233d2500723e5594f3e7c70896ffeeef32b9c950ywan
61233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r10 = 0
62233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r10, #0
63233d2500723e5594f3e7c70896ffeeef32b9c950ywan
64233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r12 = cospi_16_64 = 11585 (0x2d41)
65233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r12, #0x2d00
66233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r12, #0x41
67233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MEND
68233d2500723e5594f3e7c70896ffeeef32b9c950ywan
69233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; GENERATE_IDCT_CONSTANTS: load the IDCT cosine constants into r3 - r9.
    ; Each constant is built in two steps: "mov rN, #<high byte> << 8" followed
    ; by "add rN, #<low byte>". The combined hex value is listed next to the
    ; decimal value on each entry below.
    ; Register map after expansion (this is the layout IDCT8x8_1D reads):
    ;   r3 = cospi_28_64, r4 = cospi_4_64,  r5 = cospi_12_64, r6 = cospi_20_64,
    ;   r7 = cospi_16_64, r8 = cospi_24_64, r9 = cospi_8_64
    ; Clobbers: r3 - r9. These registers overlap the ones written by
    ; GENERATE_IADST_CONSTANTS, which assigns them different values.
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MACRO
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GENERATE_IDCT_CONSTANTS
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r3 = cospi_28_64 = 3196 (0x0c7c)
73233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r3, #0x0c00
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r3, #0x7c
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan
76233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r4 = cospi_4_64 = 16069 (0x3ec5)
77233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r4, #0x3e00
78233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r4, #0xc5
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan
80233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r5 = cospi_12_64 = 13623 (0x3537)
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r5, #0x3500
82233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r5, #0x37
83233d2500723e5594f3e7c70896ffeeef32b9c950ywan
84233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r6 = cospi_20_64 = 9102 (0x238e)
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r6, #0x2300
86233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r6, #0x8e
87233d2500723e5594f3e7c70896ffeeef32b9c950ywan
88233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r7 = cospi_16_64 = 11585 (0x2d41)
89233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r7, #0x2d00
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r7, #0x41
91233d2500723e5594f3e7c70896ffeeef32b9c950ywan
92233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r8 = cospi_24_64 = 6270 (0x187e)
93233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r8, #0x1800
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r8, #0x7e
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; r9 = cospi_8_64 = 15137 (0x3b21)
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r9, #0x3b00
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r9, #0x21
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MEND
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; TRANSPOSE8X8: transpose an 8x8 matrix of 16-bit elements in place.
    ; The data is held in q8-q15 (d16-d31) on entry and on exit.
    ; Only q8-q15 are touched; no core registers are used.
    ; The transpose is done in three passes of decreasing granularity:
    ; 64-bit halves (vswp), then 32-bit pairs (vtrn.32), then 16-bit
    ; elements (vtrn.16).
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MACRO
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE8X8
    ; Pass 1: swap the high half of q8..q11 with the low half of q12..q15
    ; (d17 <-> d24 is q8.hi <-> q12.lo, and likewise for the other pairs).
104233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vswp            d17, d24
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vswp            d23, d30
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vswp            d21, d28
107233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vswp            d19, d26
    ; Pass 2: transpose 32-bit element pairs between registers two apart.
108233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32         q8, q10
109233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32         q9, q11
110233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32         q12, q14
111233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32         q13, q15
    ; Pass 3: transpose 16-bit elements between adjacent registers.
112233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16         q8, q9
113233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16         q10, q11
114233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16         q12, q13
115233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16         q14, q15
116233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MEND
117233d2500723e5594f3e7c70896ffeeef32b9c950ywan
118233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; IDCT8x8_1D: parallel 1D IDCT on all the columns of an 8x8 matrix of
119233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; 16-bit data held in q8-q15 (input[n] is in q(8+n)). The IDCT constants
120233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; are read from r3 - r9. The output is written back to q8-q15. q0-q7 are
121233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; used as scratch during the calculation and are clobbered.
    ;
    ; Expected constants (the layout produced by GENERATE_IDCT_CONSTANTS):
    ;   r3 = cospi_28_64, r4 = cospi_4_64,  r5 = cospi_12_64, r6 = cospi_20_64,
    ;   r7 = cospi_16_64, r8 = cospi_24_64, r9 = cospi_8_64
    ;
    ; Every product is formed at 32 bits (vmull/vmlal/vmlsl.s16) and narrowed
    ; back to 16 bits with vqrshrn.s32 #14, i.e. a rounding shift right by 14
    ; with signed saturation. The comments call this dct_const_round_shift.
    ;
    ; Input rows are overwritten as soon as they are no longer needed, so the
    ; instruction order below must be preserved.
122233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MACRO
123233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IDCT8x8_1D
124233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; stage 1: odd rows q9, q11, q13, q15 -> step1[4..7] in q4 - q7
125233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d0, r3                    ; duplicate cospi_28_64
126233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d1, r4                    ; duplicate cospi_4_64
127233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d2, r5                    ; duplicate cospi_12_64
128233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d3, r6                    ; duplicate cospi_20_64
129233d2500723e5594f3e7c70896ffeeef32b9c950ywan
130233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_28_64  (input[1] = q9)
131233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d18, d0
132233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q3, d19, d0
133233d2500723e5594f3e7c70896ffeeef32b9c950ywan
134233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[5] * cospi_12_64  (input[5] = q13)
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q5, d26, d2
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q6, d27, d2
137233d2500723e5594f3e7c70896ffeeef32b9c950ywan
138233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_28_64 - input[7] * cospi_4_64  (input[7] = q15)
139233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q2, d30, d1
140233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q3, d31, d1
141233d2500723e5594f3e7c70896ffeeef32b9c950ywan
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[5] * cospi_12_64 - input[3] * cospi_20_64  (input[3] = q11)
143233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q5, d22, d3
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q6, d23, d3
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q4 = step1[4] = dct_const_round_shift(input[1] * cospi_28_64 - input[7] * cospi_4_64)
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d8, q2, #14               ; >> 14
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d9, q3, #14               ; >> 14
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q5 = step1[5] = dct_const_round_shift(input[5] * cospi_12_64 - input[3] * cospi_20_64)
151233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d10, q5, #14              ; >> 14
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d11, q6, #14              ; >> 14
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_4_64
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d18, d1
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q3, d19, d1
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[5] * cospi_20_64
    ; (the 32-bit products overwrite q9 and q13; input[1] was last read just
    ; above, and input[5] is read by these two instructions themselves)
159233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q9, d26, d3
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q13, d27, d3
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_4_64 + input[7] * cospi_28_64
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q2, d30, d0
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q3, d31, d0
165233d2500723e5594f3e7c70896ffeeef32b9c950ywan
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q9, d22, d2
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q13, d23, d2
169233d2500723e5594f3e7c70896ffeeef32b9c950ywan
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q7 = step1[7] = dct_const_round_shift(input[1] * cospi_4_64 + input[7] * cospi_28_64)
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d14, q2, #14              ; >> 14
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d15, q3, #14              ; >> 14
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; stage 2 & stage 3 - even half (rows q8, q10, q12, q14)
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d0, r7                    ; duplicate cospi_16_64
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q6 = step1[6] = dct_const_round_shift(input[5] * cospi_20_64 + input[3] * cospi_12_64)
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d12, q9, #14              ; >> 14
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d13, q13, #14             ; >> 14
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[0] * cospi_16_64  (input[0] = q8; accumulator for the sum term)
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d16, d0
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q3, d17, d0
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan
185233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[0] * cospi_16_64  (second copy; accumulator for the difference term)
186233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q13, d16, d0
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q15, d17, d0
188233d2500723e5594f3e7c70896ffeeef32b9c950ywan
189233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (input[0] + input[2]) * cospi_16_64  ("input[2]" here is the row in q12)
190233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q2,  d24, d0
191233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q3, d25, d0
192233d2500723e5594f3e7c70896ffeeef32b9c950ywan
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (input[0] - input[2]) * cospi_16_64  ("input[2]" here is the row in q12)
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q13, d24, d0
195233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q15, d25, d0
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d0, r8                    ; duplicate cospi_24_64
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d1, r9                    ; duplicate cospi_8_64
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q9 = step[0] = dct_const_round_shift((q8 + q12) * cospi_16_64)
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d18, q2, #14              ; >> 14
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d19, q3, #14              ; >> 14
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q11 = step[1] = dct_const_round_shift((q8 - q12) * cospi_16_64)
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d22, q13, #14             ; >> 14
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d23, q15, #14             ; >> 14
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_24_64  ("input[1]" here is the row in q10)
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d20, d0
210233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q3, d21, d0
211233d2500723e5594f3e7c70896ffeeef32b9c950ywan
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_8_64  (products overwrite q8 and q12, already consumed)
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q8, d20, d1
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q12, d21, d1
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_24_64 - input[3] * cospi_8_64  ("input[3]" here is the row in q14)
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q2, d28, d1
218233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q3, d29, d1
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q8, d28, d0
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q12, d29, d0
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q13 = step[2] = dct_const_round_shift(q10 * cospi_24_64 - q14 * cospi_8_64)
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d26, q2, #14              ; >> 14
226233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d27, q3, #14              ; >> 14
227233d2500723e5594f3e7c70896ffeeef32b9c950ywan
228233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q15 = step[3] = dct_const_round_shift(q10 * cospi_8_64 + q14 * cospi_24_64)
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d30, q8, #14              ; >> 14
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d31, q12, #14             ; >> 14
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; Even-half butterfly: results are parked in q0 - q3 until stage 4.
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; stage 3 - odd half (d16 is free: q8 was last read by the narrowing into d30)
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d16, r7                   ; duplicate cospi_16_64
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; stage 2 - odd half (q4..q7 hold step1[4..7] from stage 1)
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; step2[6] * cospi_16_64  (step2[6] = q14; accumulator for the difference term)
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q9, d28, d16
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q10, d29, d16
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; step2[6] * cospi_16_64  (second copy; accumulator for the sum term)
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q11, d28, d16
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q12, d29, d16
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (step2[6] - step2[5]) * cospi_16_64  (step2[5] = q13)
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q9, d26, d16
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q10, d27, d16
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (step2[5] + step2[6]) * cospi_16_64
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q11, d26, d16
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q12, d27, d16
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q5 = step1[5] = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d10, q9, #14              ; >> 14
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d11, q10, #14             ; >> 14
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q6 = step1[6] = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d12, q11, #14             ; >> 14
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d13, q12, #14             ; >> 14
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; stage 4: final butterfly, q0 - q3 (even half) with q4 - q7 (odd half)
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MEND
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Parallel 1D IADST on all the columns of an 8x8 matrix of 16-bit data which
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; is loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; output will be stored back into q8-q15 registers. This macro will touch
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q0 - q7 registers and use them as buffer during calculation.
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MACRO
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IADST8X8_1D
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d14, r0                   ; duplicate cospi_2_64
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d15, r1                   ; duplicate cospi_30_64
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_2_64  * x0
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q1, d30, d14
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d31, d14
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_30_64 * x0
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q3, d30, d15
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q4, d31, d15
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d30, r4                   ; duplicate cospi_18_64
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d31, r5                   ; duplicate cospi_14_64
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q1, d16, d15
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q2, d17, d15
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q3, d16, d14
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q4, d17, d14
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_18_64 * x4
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q5, d22, d30
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q6, d23, d30
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_14_64 * x4
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q7, d22, d31
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q8, d23, d31
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q5, d24, d31
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q6, d25, d31
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q7, d24, d30
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q8, d25, d30
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s0 + s4)
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q11, q1, q5
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q12, q2, q6
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d0, r2                   ; duplicate cospi_10_64
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d1, r3                   ; duplicate cospi_22_64
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s0 - s4)
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q1, q1, q5
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q2, q2, q6
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x0 = dct_const_round_shift(s0 + s4);
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d22, q11, #14             ; >> 14
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d23, q12, #14             ; >> 14
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s1 + s5)
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q12, q3, q7
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q15, q4, q8
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s1 - s5)
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q3, q3, q7
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q4, q4, q8
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x4 = dct_const_round_shift(s0 - s4);
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d2, q1, #14               ; >> 14
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d3, q2, #14               ; >> 14
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x1 = dct_const_round_shift(s1 + s5);
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d24, q12, #14             ; >> 14
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d25, q15, #14             ; >> 14
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x5 = dct_const_round_shift(s1 - s5);
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d6, q3, #14               ; >> 14
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d7, q4, #14               ; >> 14
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_10_64 * x2
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q4, d26, d0
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q5, d27, d0
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_22_64 * x2
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d26, d1
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q6, d27, d1
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d30, r6                   ; duplicate cospi_26_64
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d31, r7                   ; duplicate cospi_6_64
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q4, d20, d1
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q5, d21, d1
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q2, d20, d0
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q6, d21, d0
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_26_64 * x6
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q0, d18, d30
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q13, d19, d30
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q0, d28, d31
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q13, d29, d31
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_6_64  * x6
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q10, d18, d31
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q9, d19, d31
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q10, d28, d30
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q9, d29, d30
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s3 + s7)
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q14, q2, q10
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q15, q6, q9
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s3 - s7)
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q2, q2, q10
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q6, q6, q9
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x3 = dct_const_round_shift(s3 + s7);
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d28, q14, #14             ; >> 14
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d29, q15, #14             ; >> 14
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x7 = dct_const_round_shift(s3 - s7);
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d4, q2, #14               ; >> 14
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d5, q6, #14               ; >> 14
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s2 + s6)
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q9, q4, q0
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q10, q5, q13
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s2 - s6)
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q4, q4, q0
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q5, q5, q13
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d30, r8                   ; duplicate cospi_8_64
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d31, r9                   ; duplicate cospi_24_64
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x2 = dct_const_round_shift(s2 + s6);
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d18, q9, #14              ; >> 14
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d19, q10, #14             ; >> 14
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x6 = dct_const_round_shift(s2 - s6);
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d8, q4, #14               ; >> 14
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d9, q5, #14               ; >> 14
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_8_64  * x4
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q5, d2, d30
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q6, d3, d30
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_24_64 * x4
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q7, d2, d31
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q0, d3, d31
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q5, d6, d31
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q6, d7, d31
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q7, d6, d30
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q0, d7, d30
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_8_64  * x7
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q1, d4, d30
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q3, d5, d30
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_24_64 * x7
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q10, d4, d31
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d5, d31
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q1, d8, d31
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q3, d9, d31
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q10, d8, d30
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q2, d9, d30
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s4 + s6)
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q14, q5, q1
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q15, q6, q3
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s4 - s6)
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q5, q5, q1
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q6, q6, q3
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x4 = dct_const_round_shift(s4 + s6);
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d18, q14, #14             ; >> 14
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d19, q15, #14             ; >> 14
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x6 = dct_const_round_shift(s4 - s6);
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d10, q5, #14              ; >> 14
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d11, q6, #14              ; >> 14
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (s5 + s7)
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q1, q7, q10
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.s32        q3, q0, q2
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; (s5 - s7)
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q7, q7, q10
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s32        q0, q0, q2
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x5 = dct_const_round_shift(s5 + s7);
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d28, q1, #14               ; >> 14
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d29, q3, #14               ; >> 14
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x7 = dct_const_round_shift(s5 - s7);
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d14, q7, #14              ; >> 14
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d15, q0, #14              ; >> 14
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         d30, r12                  ; duplicate cospi_16_64
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_16_64 * x2
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q2, d22, d30
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q3, d23, d30
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; cospi_16_64 * x2
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q13, d22, d30
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q1, d23, d30
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_16_64 * x2 + cospi_16_64  * x3;
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q2, d24, d30
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q3, d25, d30
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_16_64 * x2 - cospi_16_64  * x3;
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q13, d24, d30
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q1, d25, d30
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x2 = dct_const_round_shift(s2);
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d4, q2, #14               ; >> 14
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d5, q3, #14               ; >> 14
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;x3 = dct_const_round_shift(s3);
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d24, q13, #14             ; >> 14
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d25, q1, #14              ; >> 14
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_16_64 * x6
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q13, d10, d30
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q1, d11, d30
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; cospi_16_64 * x6
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q11, d10, d30
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16       q0, d11, d30
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_16_64 * x6 + cospi_16_64  * x7;
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q13, d14, d30
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16       q1, d15, d30
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; cospi_16_64 * x6 - cospi_16_64  * x7;
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q11, d14, d30
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.s16       q0, d15, d30
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; x6 = dct_const_round_shift(s6);
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d20, q13, #14             ; >> 14
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d21, q1, #14              ; >> 14
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;x7 = dct_const_round_shift(s7);
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d12, q11, #14             ; >> 14
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.s32     d13, q0, #14              ; >> 14
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.16         q5, r10                   ; duplicate 0
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q9, q5, q9                ; output[1] = -x4;
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q11, q5, q2               ; output[3] = -x2;
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q13, q5, q6               ; output[5] = -x7;
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s16        q15, q5, q4               ; output[7] = -x1;
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MEND
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan    AREA     Block, CODE, READONLY ; name this block of code
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                               int dest_stride, int tx_type)
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0  int16_t input
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1  uint8_t *dest
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2  int dest_stride
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3  int tx_type)
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan; This function will only handle tx_type of 1,2,3.
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_iht8x8_64_add_neon| PROC
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; load the inputs into d16-d19
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.s16        {q8,q9}, [r0]!
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.s16        {q10,q11}, [r0]!
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.s16        {q12,q13}, [r0]!
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.s16        {q14,q15}, [r0]!
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push            {r0-r10}
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vpush           {d8-d15}
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the input data
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE8X8
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; decide the type of transform
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         r3, #2
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         idct_iadst
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         r3, #3
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         iadst_iadst
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan
590233d2500723e5594f3e7c70896ffeeef32b9c950ywaniadst_idct
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; generate IDCT constants
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GENERATE_IDCT_CONSTANTS
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; first transform rows
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IDCT8x8_1D
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the matrix
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE8X8
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; generate IADST constants
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GENERATE_IADST_CONSTANTS
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; then transform columns
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IADST8X8_1D
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan    b end_vp9_iht8x8_64_add_neon
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan
608233d2500723e5594f3e7c70896ffeeef32b9c950ywanidct_iadst
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; generate IADST constants
610233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GENERATE_IADST_CONSTANTS
611233d2500723e5594f3e7c70896ffeeef32b9c950ywan
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; first transform rows
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IADST8X8_1D
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan
615233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the matrix
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE8X8
617233d2500723e5594f3e7c70896ffeeef32b9c950ywan
618233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; generate IDCT constants
619233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GENERATE_IDCT_CONSTANTS
620233d2500723e5594f3e7c70896ffeeef32b9c950ywan
621233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; then transform columns
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IDCT8x8_1D
623233d2500723e5594f3e7c70896ffeeef32b9c950ywan
624233d2500723e5594f3e7c70896ffeeef32b9c950ywan    b end_vp9_iht8x8_64_add_neon
625233d2500723e5594f3e7c70896ffeeef32b9c950ywan
626233d2500723e5594f3e7c70896ffeeef32b9c950ywaniadst_iadst
627233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; generate IADST constants
628233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GENERATE_IADST_CONSTANTS
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; first transform rows
631233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IADST8X8_1D
632233d2500723e5594f3e7c70896ffeeef32b9c950ywan
633233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the matrix
634233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE8X8
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan
636233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; then transform columns
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IADST8X8_1D
638233d2500723e5594f3e7c70896ffeeef32b9c950ywan
639233d2500723e5594f3e7c70896ffeeef32b9c950ywanend_vp9_iht8x8_64_add_neon
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vpop           {d8-d15}
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop            {r0-r10}
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q8, q8, #5
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q9, q9, #5
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q10, q10, #5
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q11, q11, #5
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q12, q12, #5
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q13, q13, #5
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q14, q14, #5
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s16       q15, q15, #5
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan
653233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; save dest pointer
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r0, r1
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; load destination data
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d0}, [r1], r2
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d1}, [r1], r2
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d2}, [r1], r2
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d3}, [r1], r2
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d4}, [r1], r2
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d5}, [r1], r2
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d6}, [r1], r2
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.64         {d7}, [r1]
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q8, q8, d0
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q9, q9, d1
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q10, q10, d2
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q11, q11, d3
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q12, q12, d4
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q13, q13, d5
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q14, q14, d6
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8        q15, q15, d7
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; clip_pixel
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d0, q8
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d1, q9
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d2, q10
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d3, q11
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d4, q12
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d5, q13
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d6, q14
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovun.s16     d7, q15
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; store the data
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d0}, [r0], r2
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d1}, [r0], r2
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d2}, [r0], r2
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d3}, [r0], r2
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d4}, [r0], r2
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d5}, [r0], r2
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d6}, [r0], r2
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.64         {d7}, [r0], r2
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bx          lr
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP  ; |vp9_iht8x8_64_add_neon|
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan    END
699