///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Target: AArch64 (ARMv8-A) with Advanced SIMD, GNU assembler syntax.

// -----------------------------------------------------------------------------
// push_v_regs / pop_v_regs
//
// Save and restore q8-q15, x8-x17 and x29/x30 around the function body.
// The ten pre-indexed stores move sp down by 4*32 + 6*16 = 224 bytes, so sp
// keeps its 16-byte alignment. pop_v_regs undoes the stores in exact reverse
// order.
// Under AAPCS64 only the low 64 bits of v8-v15 are callee-saved and x8-x17 are
// caller-saved, so this saves more than the ABI strictly requires.
// -----------------------------------------------------------------------------
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm

.text
.global ixheaacd_over_lap_add2_armv8

// -----------------------------------------------------------------------------
// ixheaacd_over_lap_add2_armv8
//
// Windowed combination of two 32-bit input buffers with a table of 16-bit
// coefficients, written to a strided 32-bit output in two halves of X5
// results each.
//
// In (as consumed by the code below):
//   X0 = input A, 32-bit words. Read forward from A + 4*X5 (first half) and
//        backward from A + 8*X5 - 16 (second half).
//   X1 = input B, 32-bit words. Read backward from B + 4*X5 - 16 (first half)
//        and forward from B (second half).
//   X2 = output, 32-bit words, one store every 4*X6 bytes.
//   X3 = 16-bit coefficients, read as de-interleaved (even, odd) pairs:
//        forward from X3 (first half), backward from X3 + 4*X5 - 16 (second).
//   X4 = shift control: every result is (acc + (1 << (14 - X4))) shifted by
//        SSHL with a per-lane shift of (X4 - 15).
//   X5 = number of results per half. Each loop retires 8 results per pass and
//        the surrounding prologue/epilogue retire another 8.
//        NOTE(review): this looks like it requires X5 to be a multiple of 8
//        and at least 16 - confirm against the callers.
//   X6 = output stride in 32-bit elements.
//
// Per-lane arithmetic, with a/b the two 32-bit inputs split into 16-bit
// halves and w0/w1 the two coefficients:
//   acc = ((lo(a)*w0 - lo(b)*w1) >> 16) + hi(a)*w0 - hi(b)*w1
// The low-half products are unsigned (UMULL/UMLSL), the high-half products
// signed (SMLAL/SMLSL). In the second half a is the saturating negation of
// input A taken in reversed order.
//
// Out:   none (results are stored through X2).
// Clobb: X2-X5 (argument registers), V0-V7, V16-V24, NZCV. x8-x17, x29, x30
//        and q8-q15 are restored by pop_v_regs.
// Stack: 224 bytes, released before return.
//
// NOTE(review): presumably called from C as
//   void f(int32 *a, int32 *b, int32 *out, const int16 *win,
//          int16 q_shift, int32 size, int32 ch_fac)
// - confirm against the caller's declaration. X5 and X6 are used as full
// 64-bit registers here; if they are passed as 32-bit ints AAPCS64 leaves
// their upper 32 bits unspecified, in which case they should be sign-extended
// on entry.
// -----------------------------------------------------------------------------
ixheaacd_over_lap_add2_armv8:
    push_v_regs

    // ---- first half: pointer, stride and shift set-up ----------------------
    MOV             X8, X5                      // X8 = results left in first half
    SUB             X12, X5, #1
    LSL             X9, X5, #2
    LSL             X12, X12, #2
    ADD             X10, X0, X9                 // X10 = A + 4*X5, walks forward
    ADD             X7, X1, X12
    ADD             X4, X4, #1
    LD2             {V0.4H, V1.4H}, [X10], #16  // V0 = low halves, V1 = high halves of A
    LSL             X11, X6, #2                 // X11 = output stride in bytes
    SUB             X7, X7, #12                 // X7 = B + 4*X5 - 16, walks backward
    SUB             X4, X4, #16                 // X4 = shift - 15
    MOV             X12, #-16                   // backward step of one 16-byte group
    MOV             X13, #1
    ADD             X14, X4, #1
    NEG             X14, X14                    // X14 = 14 - shift
    DUP             V21.4S, W4                  // V21 = per-lane SSHL shift amount
    LD2             {V6.4H, V7.4H}, [X7], X12
    LSL             X4, X13, X14                // X4 = 1 << (14 - shift)
    REV64           V4.4H, V6.4H                // B low halves, lane order reversed
    DUP             V20.4S, W4                  // V20 = rounding constant
    REV64           V5.4H, V7.4H                // B high halves, lane order reversed
    MOV             X4, X3                      // keep coefficient base for second half

    MOV             X9, X2                      // keep output base for second half
    LD2             {V2.4H, V3.4H}, [X3], #16   // V2 = even, V3 = odd coefficients

    // ---- first half: software-pipeline prologue (results 0..3 -> V24) ------
    UMULL           V23.4S, V0.4H, V2.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    LD2             {V8.4H, V9.4H}, [X10], #16
    SSHR            V23.4S, V23.4S, #16
    LD2             {V10.4H, V11.4H}, [X3], #16
    SMLAL           V23.4S, V1.4H, V2.4H
    SMLSL           V23.4S, V5.4H, V3.4H
    LD2             {V14.4H, V15.4H}, [X7], X12
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S      // add rounding constant (saturating)
    SSHL            V22.4S, V22.4S, V21.4S      // apply shift
    MOV             V24.16B, V22.16B
    SUB             X8, X8, #8

    // Each pass stores the four results held in V24, finishes and stores the
    // next four (V19 -> V16), and prepares four more in V24 for the next pass.
LOOP_1:

    LD2             {V0.4H, V1.4H}, [X10], #16
    UMULL           V19.4S, V8.4H, V10.4H
    LD2             {V2.4H, V3.4H}, [X3], #16
    UMLSL           V19.4S, V12.4H, V11.4H
    LD2             {V6.4H, V7.4H}, [X7], X12
    UMULL           V23.4S, V0.4H, V2.4H
    REV64           V4.4H, V6.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    REV64           V5.4H, V7.4H
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[0], [X2], X11
    SMLAL           V19.4S, V9.4H, V10.4H
    ST1             {V24.S}[1], [X2], X11
    SSHR            V23.4S, V23.4S, #16
    ST1             {V24.S}[2], [X2], X11
    SMLAL           V23.4S, V1.4H, V2.4H

    ST1             {V24.S}[3], [X2], X11
    SMLSL           V19.4S, V13.4H, V11.4H
    SMLSL           V23.4S, V5.4H, V3.4H

    LD2             {V8.4H, V9.4H}, [X10], #16
    LD2             {V10.4H, V11.4H}, [X3], #16

    LD2             {V14.4H, V15.4H}, [X7], X12
    SQADD           V18.4S, V19.4S, V20.4S
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X2], X11
    SSHL            V22.4S, V22.4S, V21.4S

    MOV             V24.16B, V22.16B
    SUBS            X8, X8, #8

    ST1             {V16.S}[1], [X2], X11
    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11

    BGT             LOOP_1

    // ---- first half: epilogue (last 8 results), interleaved with the -------
    // ---- scalar set-up for the second half ---------------------------------
    ST1             {V24.S}[0], [X2], X11
    UMULL           V19.4S, V8.4H, V10.4H
    UMLSL           V19.4S, V12.4H, V11.4H
    ST1             {V24.S}[1], [X2], X11
    ST1             {V24.S}[2], [X2], X11
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X2], X11
    SMLAL           V19.4S, V9.4H, V10.4H
    SMLSL           V19.4S, V13.4H, V11.4H
    MOV             X12, #12
    MUL             W7, W5, W6                  // X7 = X5 * X6 (32-bit product, zero-extended)

    LSL             W10, W5, #1                 // X10 = 2 * X5
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B

    ST1             {V16.S}[0], [X2], X11
    LSL             X7, X7, #2

    ST1             {V16.S}[1], [X2], X11
    ADD             X7, X7, X9                  // X7 = out + 4*X5*X6: second-half output

    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11

    // ---- second half: pointer set-up ---------------------------------------
    SUB             X11, X10, #1                // X11 = 2*X5 - 1
    LSL             X10, X11, #2
    ADD             X10, X0, X10
    LSL             X11, X11, #1
    SUB             X10, X10, X12               // X10 = A + 8*X5 - 16, walks backward
    LSL             X8, X6, #2                  // X8 = output stride in bytes
    MOV             X12, #-16
    ADD             X11, X11, X4

    LD1             {V6.4S}, [X10], X12
    SUB             X11, X11, #14               // X11 = coefficients + 4*X5 - 16, walks backward

    // Negate four words of A and reverse their order, then split them into
    // low halves (V1) and high halves (V0).
    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S

    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12
    REV64           V2.4H, V2.4H                // coefficients in reversed lane order
    REV64           V3.4H, V3.4H

    LD2             {V4.4H, V5.4H}, [X1], #16   // B is read forward in this half

    // ---- second half: software-pipeline prologue (results 0..3 -> V24) -----
    LD1             {V14.4S}, [X10], X12
    UMULL           V23.4S, V1.4H, V3.4H
    UMLSL           V23.4S, V4.4H, V2.4H
    REV64           V8.4S, V14.4S
    SQNEG           V8.4S, V8.4S
    LD2             {V10.4H, V11.4H}, [X11], X12
    SSHR            V23.4S, V23.4S, #16
    LD2             {V12.4H, V13.4H}, [X1], #16
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    REV64           V9.4S, V9.4S
    REV64           V8.4S, V8.4S
    REV64           V10.4H, V10.4H
    REV64           V11.4H, V11.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SUB             X5, X5, #8                  // X5 = results left in second half
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B

    // Same pipeline shape as LOOP_1: store V24, finish and store V16, and
    // prepare the next V24.
LOOP_2:

    LD1             {V6.4S}, [X10], X12
    UMULL           V19.4S, V9.4H, V11.4H
    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H

    LD2             {V4.4H, V5.4H}, [X1], #16
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[0], [X7], X8
    UMULL           V23.4S, V1.4H, V3.4H
    ST1             {V24.S}[1], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[2], [X7], X8
    UMLSL           V23.4S, V4.4H, V2.4H
    ST1             {V24.S}[3], [X7], X8
    SMLAL           V19.4S, V8.4H, V11.4H
    LD1             {V14.4S}, [X10], X12
    SSHR            V23.4S, V23.4S, #16
    SMLSL           V19.4S, V13.4H, V10.4H
    LD2             {V10.4H, V11.4H}, [X11], X12
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    REV64           V8.4S, V14.4S
    LD2             {V12.4H, V13.4H}, [X1], #16
    SQNEG           V8.4S, V8.4S
    REV64           V11.4H, V11.4H
    REV64           V10.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    REV64           V9.4S, V9.4S
    REV64           V8.4S, V8.4S
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    SUBS            X5, X5, #8
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X7], X8
    SSHL            V22.4S, V22.4S, V21.4S
    ST1             {V16.S}[1], [X7], X8
    MOV             V24.16B, V22.16B

    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8

    BGT             LOOP_2

    // ---- second half: epilogue (last 8 results) ----------------------------
    ST1             {V24.S}[0], [X7], X8
    UMULL           V19.4S, V9.4H, V11.4H
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[1], [X7], X8
    ST1             {V24.S}[2], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X7], X8

    SMLAL           V19.4S, V8.4H, V11.4H
    SMLSL           V19.4S, V13.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B

    ST1             {V16.S}[0], [X7], X8
    ST1             {V16.S}[1], [X7], X8
    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8

    pop_v_regs
    RET