1///******************************************************************************
2// *
3// * Copyright (C) 2018 The Android Open Source Project
4// *
5// * Licensed under the Apache License, Version 2.0 (the "License");
6// * you may not use this file except in compliance with the License.
7// * You may obtain a copy of the License at:
8// *
9// * http://www.apache.org/licenses/LICENSE-2.0
10// *
11// * Unless required by applicable law or agreed to in writing, software
12// * distributed under the License is distributed on an "AS IS" BASIS,
13// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// * See the License for the specific language governing permissions and
15// * limitations under the License.
16// *
17// *****************************************************************************
18// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21
// push_v_regs
//
// Function-entry save sequence. Pushes, in this order (each with a
// pre-decrement of SP):
//   q8-q15          4 x 32 bytes
//   X8-X17          5 x 16 bytes
//   X29, X30        1 x 16 bytes
// Total stack use is 224 bytes, a multiple of 16, so SP keeps whatever
// 16-byte alignment it had on entry.
//
// Must be paired with pop_v_regs, which restores in the exact reverse order.
//
// NOTE(review): under AAPCS64 only the low 64 bits of v8-v15 are callee-saved
// and X8-X17 are caller-saved, so this saves more than the ABI requires.
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm
// pop_v_regs
//
// Function-exit restore sequence; the mirror image of push_v_regs.
// Pops X29/X30, then X16/X17 down to X8/X9, then q14/q15 down to q8/q9,
// post-incrementing SP after each pair (224 bytes released in total).
//
// SP must hold the same value it had immediately after push_v_regs ran,
// otherwise the wrong stack slots are reloaded.
.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm
//------------------------------------------------------------------------------
// ixheaacd_over_lap_add2_armv8
//
// Target/ABI: AArch64, GNU as syntax, AAPCS64 argument registers, leaf routine.
// The C prototype is not visible in this file; the roles below are read off
// the addressing and arithmetic in the code. Let N = X5 and S = X6.
//
// In:   X0  base of 32-bit words, called A[] here
//       X1  base of 32-bit words, called B[] here
//       X2  output base, 32-bit words, called OUT[] here
//       X3  base of 16-bit table, called W[] here (2*N entries are read)
//       X4  shift control q: results are shifted by (q - 15) with SSHL
//           (a negative count shifts right) after adding 1 << (14 - q)
//       X5  N, number of outputs per half
//       X6  S, output stride in 32-bit words (stores advance by 4*S bytes)
//
// First half, i = 0 .. N-1 (written to OUT[i*S]):
//       a = A[N + i]          b = B[N - 1 - i]
//       acc  = (lo16(a)*W[2i] - lo16(b)*W[2i+1]) >> 16    (UMULL/UMLSL, SSHR)
//       acc += hi16(a)*W[2i] - hi16(b)*W[2i+1]            (SMLAL/SMLSL)
//
// Second half, j = 0 .. N-1 (written starting at
// OUT + 4 * (lo16(N) * lo16(S)) bytes, same stride):
//       a = sat_neg(A[2N - 1 - j])    c = B[j]
//       acc  = (lo16(a)*W[2N-1-2j] - lo16(c)*W[2N-2-2j]) >> 16
//       acc += hi16(a)*W[2N-1-2j] - hi16(c)*W[2N-2-2j]
//
// Both halves finish each lane with
//       out = SSHL(SQADD(acc, 1 << (14 - q)), q - 15)
//
// Loop structure: each half produces 8 outputs per loop iteration plus 8 in
// its epilogue, and the loops are do-while (SUBS #8 / BGT). The output count
// therefore only equals N when N is a multiple of 8 and N >= 16.
//
// NOTE(review): X4, X5 and X6 are consumed as full 64-bit registers in the
// address/shift arithmetic below - assumes the caller passes them already
// extended to 64 bits; confirm against the C prototype.
//
// Vector registers written: V0-V16, V18-V24, V29-V31 (V8-V15 are saved and
// restored by push_v_regs / pop_v_regs). Flags are clobbered by SUBS.
//------------------------------------------------------------------------------
.text
.global ixheaacd_over_lap_add2_armv8


ixheaacd_over_lap_add2_armv8:
    push_v_regs

    // ---- First half: pointer, shift and rounding setup ---------------------
    MOV             X8, X5                      // X8  = N, loop counter
    SUB             X12, X5, #1
    LSL             X9, X5, #2                  // X9  = 4*N bytes
    LSL             X12, X12, #2                // X12 = 4*(N-1) bytes
    ADD             X10, X0, X9                 // X10 = &A[N], read ascending
    ADD             X7, X1, X12
    ADD             X4, X4, #1
    LD2             {V0.4H, V1.4H}, [X10], #16  // V0/V1 = lo16/hi16 of A[N..N+3]
    LSL             X11, X6, #2                 // X11 = output stride in bytes
    SUB             X7, X7, #12                 // X7  = &B[N-4], read descending
    SUB             X4, X4, #16                 // X4  = q - 15
    MOV             X12, #-16                   // post-index step for descending reads
    MOV             X13, #1
    ADD             X14, X4, #1
    NEG             X14, X14                    // X14 = 14 - q
    DUP             V21.4S, W4                  // V21 = SSHL count (q - 15) in every lane
    LD2             {V6.4H, V7.4H}, [X7], X12   // V6/V7 = lo16/hi16 of B[N-4..N-1]
    LSL             X4, X13, X14                // X4  = 1 << (14 - q)
    REV64           V4.4H, V6.4H                // reverse lanes: lane 0 <- B[N-1]
    DUP             V20.4S, W4                  // V20 = rounding constant in every lane
    REV64           V5.4H, V7.4H
    MOV             X4, X3                      // keep W[] base for the second half

    MOV             X9, X2                      // keep OUT[] base for the second half
    LD2             {V2.4H, V3.4H}, [X3], #16   // V2 = even W entries, V3 = odd W entries

    // Two 4-lane groups are kept in flight:
    //   group A: operands V0-V5,  accumulator V23 -> V22 -> V24
    //   group B: operands V8-V13, accumulator V19 -> V18 -> V16
    // Compute the first group-A result and preload group-B operands.
    UMULL           V23.4S, V0.4H, V2.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    LD2             {V8.4H, V9.4H}, [X10], #16
    SSHR            V23.4S, V23.4S, #16
    LD2             {V10.4H, V11.4H}, [X3], #16
    SMLAL           V23.4S, V1.4H, V2.4H
    SMLSL           V23.4S, V5.4H, V3.4H
    LD2             {V14.4H, V15.4H}, [X7], X12
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S      // add rounding constant (saturating)
    SSHL            V22.4S, V22.4S, V21.4S      // shift by (q - 15)
    MOV             V24.16B, V22.16B
    SUB             X8, X8, #8

    // Per iteration: store the pending group-A result (V24), finish and store
    // group B (V16), compute the next group-A result and reload both operand
    // sets. Eight outputs are written per iteration.
LOOP_1:

    LD2             {V0.4H, V1.4H}, [X10], #16
    UMULL           V19.4S, V8.4H, V10.4H
    LD2             {V2.4H, V3.4H}, [X3], #16
    UMLSL           V19.4S, V12.4H, V11.4H
    LD2             {V6.4H, V7.4H}, [X7], X12
    UMULL           V23.4S, V0.4H, V2.4H
    REV64           V4.4H, V6.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    REV64           V5.4H, V7.4H
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[0], [X2], X11
    SMLAL           V19.4S, V9.4H, V10.4H
    ST1             {V24.S}[1], [X2], X11
    SSHR            V23.4S, V23.4S, #16
    ST1             {V24.S}[2], [X2], X11
    SMLAL           V23.4S, V1.4H, V2.4H

    ST1             {V24.S}[3], [X2], X11
    SMLSL           V19.4S, V13.4H, V11.4H
    SMLSL           V23.4S, V5.4H, V3.4H

    LD2             {V8.4H, V9.4H}, [X10], #16
    LD2             {V10.4H, V11.4H}, [X3], #16


    LD2             {V14.4H, V15.4H}, [X7], X12
    SQADD           V18.4S, V19.4S, V20.4S
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X2], X11
    SSHL            V22.4S, V22.4S, V21.4S


    MOV             V24.16B, V22.16B
    SUBS            X8, X8, #8

    ST1             {V16.S}[1], [X2], X11
    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11


    BGT             LOOP_1

    // ---- First-half epilogue: drain the two groups still in flight, and
    //      interleave the address setup for the second half ------------------
    ST1             {V24.S}[0], [X2], X11
    UMULL           V19.4S, V8.4H, V10.4H
    UMLSL           V19.4S, V12.4H, V11.4H
    ST1             {V24.S}[1], [X2], X11
    ST1             {V24.S}[2], [X2], X11
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X2], X11
    SMLAL           V19.4S, V9.4H, V10.4H
    SMLSL           V19.4S, V13.4H, V11.4H
    MOV             X12, #12
    MOV             V30.S[0], W5
    MOV             V31.S[0], W6
    SMULL           V29.4S, V30.4H, V31.4H      // lane 0 = lo16(N) * lo16(S), signed 16x16
    MOV             W7, V29.S[0]                // X7 = that product, zero-extended

    LSL             W10, W5, #1                 // W10 = 2*N
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B

    ST1             {V16.S}[0], [X2], X11
    LSL             X7, X7, #2                  // product -> byte offset

    ST1             {V16.S}[1], [X2], X11
    ADD             X7, X7, X9                  // X7 = OUT base + 4*N*S: second-half output

    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11

    SUB             X11, X10, #1                // X11 = 2*N - 1
    LSL             X10, X11, #2
    ADD             X10, X0, X10                // X10 = &A[2N-1]
    LSL             X11, X11, #1                // X11 = 2*(2N-1) bytes
    SUB             X10, X10, X12               // X10 = &A[2N-4], read descending
    LSL             X8, X6, #2                  // X8  = output stride in bytes
    MOV             X12, #-16                   // post-index step for descending reads
    ADD             X11, X11, X4                // X4 still holds the W[] base

    // ---- Second half: first group-A operands and result -------------------
    LD1             {V6.4S}, [X10], X12         // A[2N-4 .. 2N-1]
    SUB             X11, X11, #14               // X11 = &W[2N-8], read descending


    // REV64.4S + UZP1/UZP2 + REV64.4S leaves, in the low four halfword lanes,
    // the lo16 (V1) and hi16 (V0) parts of the negated words in reversed
    // order, i.e. lane 0 comes from A[2N-1].
    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S                // saturating negate


    UZP1            V1.8H, V0.8H, V0.8H         // even halfwords = lo16 parts
    UZP2            V0.8H, V0.8H, V0.8H         // odd halfwords  = hi16 parts
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12  // V2 = even W entries, V3 = odd W entries
    REV64           V2.4H, V2.4H                // reverse lanes: lane 0 <- W[2N-2]
    REV64           V3.4H, V3.4H                // reverse lanes: lane 0 <- W[2N-1]

    LD2             {V4.4H, V5.4H}, [X1], #16   // V4/V5 = lo16/hi16 of B[0..3], ascending

    UMULL           V23.4S, V1.4H, V3.4H
    UMLSL           V23.4S, V4.4H, V2.4H
    SSHR            V23.4S, V23.4S, #16
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B            // pending group-A result for LOOP_2


    // Preload group-B operands (V8-V13) for the first LOOP_2 iteration.
    LD1             {V14.4S}, [X10], X12
    REV64           V8.4S, V14.4S
    SQNEG           V8.4S, V8.4S
    LD2             {V10.4H, V11.4H}, [X11], X12
    LD2             {V12.4H, V13.4H}, [X1], #16
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    REV64           V9.4S, V9.4S
    REV64           V8.4S, V8.4S
    REV64           V10.4H, V10.4H
    REV64           V11.4H, V11.4H
    SUB             X5, X5, #8                  // X5 = N - 8, LOOP_2 counter


    // Same two-group pipeline as LOOP_1, with the second-half operand
    // preparation. Eight outputs are written per iteration.
LOOP_2:


    LD1             {V6.4S}, [X10], X12
    UMULL           V19.4S, V9.4H, V11.4H
    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12
    REV64           V2.4H, V2.4H                // upper 64 bits are zero after LD2 .4H
    REV64           V3.4H, V3.4H

    LD2             {V4.4H, V5.4H}, [X1], #16
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[0], [X7], X8
    UMULL           V23.4S, V1.4H, V3.4H
    ST1             {V24.S}[1], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[2], [X7], X8
    UMLSL           V23.4S, V4.4H, V2.4H
    ST1             {V24.S}[3], [X7], X8
    SMLAL           V19.4S, V8.4H, V11.4H
    LD1             {V14.4S}, [X10], X12
    SSHR            V23.4S, V23.4S, #16
    SMLSL           V19.4S, V13.4H, V10.4H
    LD2             {V10.4H, V11.4H}, [X11], X12
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    REV64           V8.4S, V14.4S
    LD2             {V12.4H, V13.4H}, [X1], #16
    SQNEG           V8.4S, V8.4S
    REV64           V11.4H, V11.4H
    REV64           V10.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    REV64           V9.4S, V9.4S
    REV64           V8.4S, V8.4S
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    SUBS            X5, X5, #8
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X7], X8
    SSHL            V22.4S, V22.4S, V21.4S
    ST1             {V16.S}[1], [X7], X8
    MOV             V24.16B, V22.16B

    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8

    BGT             LOOP_2

    // ---- Second-half epilogue: drain the two groups still in flight -------
    ST1             {V24.S}[0], [X7], X8
    UMULL           V19.4S, V9.4H, V11.4H
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[1], [X7], X8
    ST1             {V24.S}[2], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X7], X8

    SMLAL           V19.4S, V8.4H, V11.4H
    SMLSL           V19.4S, V13.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B

    ST1             {V16.S}[0], [X7], X8
    ST1             {V16.S}[1], [X7], X8
    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8

    pop_v_regs
    RET
