/* rsCpuIntrinsics_advsimd_Blur.S -- revision ea76eb386a2d851d50be69ebeb7ae593f84a5be9 */
1/* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: 18#define PRIVATE(f) .text; .align 4; .type f,#function; f: 19#define END(f) .size f, .-f; 20 21.set FRACTION_BITS, 7 22.set MAX_R, 25 23 24 25/* A quick way of making a line of code conditional on some other condition. 26 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with 27 * `ifcc`: 28 */ 29.macro ifcc zzz:vararg 30.if cc 31 \zzz 32.endif 33.endm 34 35/* Fetch 16 columns of bytes (regardless of image format), convolve these 36 * vertically, and leave them in the register file. If working near the top or 37 * bottom of an image then clamp the addressing while loading the data in. 38 * 39 * The convolution is fully unrolled for windows up to max_r, with the 40 * outermost edges calculated first. This way it's possible to branch directly 41 * into the relevant part of the code for an arbitrary convolution radius. Two 42 * variants of the loop are produced; one eliminates the clamping code for a 43 * slight speed advantage. 44 * 45 * Where the macro is called with reg=x, the specified register is taken to 46 * contain a pre-calculated pointer into one of the two loops. 
47 * 48 * Input: 49 * x1 -- src 50 * x2 -- pitch 51 * x5 -- r 52 * x6 -- rup 53 * x7 -- rdn 54 * x12 -- switch index 55 * q0-q3 -- coefficient table 56 * x13 = -pitch 57 * x15 = top-row in 58 * x19 = bottom-row in 59 * Output: 60 * x1 += 16 61 * q10,q11 -- 16 convolved columns 62 * Modifies: 63 * x10 = upper row pointer 64 * x11 = lower row pointer 65 * q12-q15 = temporary sums 66 */ 67.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/ 68 .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif 69 70 ld1 {v15.16b}, [x1], #16 71 mov x10, x15 72 73 uxtl v14.8h, v15.8b 74// prfm PLDL1KEEP,[x1, #16] // TODO: confirm 75 uxtl2 v15.8h, v15.16b 76 .if \max_r < 16 // approximate 77 ifcc adr \reg, 1f 78 .else 79 ifcc adrp \reg, 1f 80 ifcc add \reg, \reg, #:lo12:1f 81 .endif 82 83 umull v12.4s, v14.4h, v0.h[0] 84 ifcc sub \reg, \reg, x5, LSL #6 85 umull2 v13.4s, v14.8h, v0.h[0] 86 mov x11, x19 87 umull v14.4s, v15.4h, v0.h[0] 88 ifcc add \reg, \reg, x5, LSL #3 89 umull2 v15.4s, v15.8h, v0.h[0] 90 br \reg 91 92 .irp rowclamp, 1, 0 93 .set cc, \rowclamp 94 .align 4 95 .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h 96 .set i, \dreg * 8 + \lane 97 .if 0 < i && i <= \max_r 98 ld1 {v10.16b}, [x10], x2 99 ifcc cmp x6, #i 100 ld1 {v11.16b}, [x11], x13 101 ifcc csel x10, x15, x10, lo 102 uaddl v16.8h, v10.8b, v11.8b 103 ifcc cmp x7, #i 104 uaddl2 v11.8h, v10.16b, v11.16b 105 ifcc csel x11, x19, x11, lo 106 umlal v12.4s, v16.4h, v\dreg\doth[\lane] 107 umlal2 v13.4s, v16.8h, v\dreg\doth[\lane] 108// prfm PLDL1KEEP,[x10, #32] // TODO: confirm 109nop 110 umlal v14.4s, v11.4h, v\dreg\doth[\lane] 111// prfm PLDL1KEEP,[x11, #32] // TODO: confirm 112nop 113 umlal2 v15.4s, v11.8h, v\dreg\doth[\lane] 114 .endif 115 .endr ; .endr ; .endr 116 .if \rowclamp == 1 117 1: \labelc : 118 b 2f 119 .else 120 2: \labelnc : 121 .endif 122 .endr 123 124 uqrshrn v10.4h, v12.4s, #16 - FRACTION_BITS 125 add x15, x15, #16 126 uqrshrn2 v10.8h, v13.4s, #16 - 
FRACTION_BITS 127 add x19, x19, #16 128 uqrshrn v11.4h, v14.4s, #16 - FRACTION_BITS 129 uqrshrn2 v11.8h, v15.4s, #16 - FRACTION_BITS 130.endm /*}}}*/ 131 132/* Some portion of the convolution window (as much as will fit, and all of it 133 * for the uchar1 cases) is kept in the register file to avoid unnecessary 134 * memory accesses. This forces the horizontal loops to be unrolled because 135 * there's no indexed addressing into the register file. 136 * 137 * As in the fetch macro, the operations are ordered from outside to inside, so 138 * that jumping into the middle of the block bypasses the unwanted window taps. 139 * 140 * There are several variants of the macro because of the fixed offets of the 141 * taps -- the wider the maximum radius the further the centre tap is from the 142 * most recently fetched data. This means that pre-filling the window requires 143 * more data that won't be used and it means that rotating the window involves 144 * more mov operations. 145 * 146 * When the buffer gets too big the buffer at [x9] is used. 147 * 148 * Input: 149 * q4-q11 -- convoltion window 150 * x9 -- pointer to additional convolution window data 151 * Output: 152 * x9 -- updated buffer pointer (if used) 153 * d31 -- result to be stored 154 * Modifies: 155 * x12 -- temp buffer pointer 156 * q12-q13 -- temporaries for load and vext operations. 
157 * q14-q15 -- intermediate sums 158 */ 159#define TUNED_LIST1 8, 16 160.macro hconv1_8/*{{{*/ 161 umull v14.4s, v9.4h, v0.h[0] 162 umull2 v15.4s, v9.8h, v0.h[0] 163 164 adr x16, 100f 165 ldrsh x12, [x16, x5, LSL #1] 166 add x12, x12, x16 167 br x12 168 100: .hword -4 169 .hword 101f-100b 170 .hword 102f-100b 171 .hword 103f-100b 172 .hword 104f-100b 173 .hword 105f-100b 174 .hword 106f-100b 175 .hword 107f-100b 176 .hword 108f-100b 177 .align 4 178 108: umlal v14.4s, v8.4h, v1.h[0] 179 umlal2 v15.4s, v8.8h, v1.h[0] 180 umlal v14.4s, v10.4h, v1.h[0] 181 umlal2 v15.4s, v10.8h, v1.h[0] 182 107: ext v12.16b, v8.16b, v9.16b, #1*2 183 ext v13.16b, v9.16b, v10.16b, #7*2 184 umlal v14.4s, v12.4h, v0.h[7] 185 umlal2 v15.4s, v12.8h, v0.h[7] 186 umlal v14.4s, v13.4h, v0.h[7] 187 umlal2 v15.4s, v13.8h, v0.h[7] 188 106: ext v12.16b, v8.16b, v9.16b, #2*2 189 ext v13.16b, v9.16b, v10.16b, #6*2 190 umlal v14.4s, v12.4h, v0.h[6] 191 umlal2 v15.4s, v12.8h, v0.h[6] 192 umlal v14.4s, v13.4h, v0.h[6] 193 umlal2 v15.4s, v13.8h, v0.h[6] 194 105: ext v12.16b, v8.16b, v9.16b, #3*2 195 ext v13.16b, v9.16b, v10.16b, #5*2 196 umlal v14.4s, v12.4h, v0.h[5] 197 umlal2 v15.4s, v12.8h, v0.h[5] 198 umlal v14.4s, v13.4h, v0.h[5] 199 umlal2 v15.4s, v13.8h, v0.h[5] 200 104: //ext v12.16b, v8.16b, v9.16b, #4*2 201 //ext v13.16b, v9.16b, v10.16b, #4*2 202 umlal2 v14.4s, v8.8h, v0.h[4] 203 umlal v15.4s, v9.4h, v0.h[4] 204 umlal2 v14.4s, v9.8h, v0.h[4] 205 umlal v15.4s, v10.4h, v0.h[4] 206 103: ext v12.16b, v8.16b, v9.16b, #5*2 207 ext v13.16b, v9.16b, v10.16b, #3*2 208 umlal v14.4s, v12.4h, v0.h[3] 209 umlal2 v15.4s, v12.8h, v0.h[3] 210 umlal v14.4s, v13.4h, v0.h[3] 211 umlal2 v15.4s, v13.8h, v0.h[3] 212 102: ext v12.16b, v8.16b, v9.16b, #6*2 213 ext v13.16b, v9.16b, v10.16b, #2*2 214 umlal v14.4s, v12.4h, v0.h[2] 215 umlal2 v15.4s, v12.8h, v0.h[2] 216 umlal v14.4s, v13.4h, v0.h[2] 217 umlal2 v15.4s, v13.8h, v0.h[2] 218 101: ext v12.16b, v8.16b, v9.16b, #7*2 219 ext v13.16b, v9.16b, v10.16b, #1*2 220 
umlal v14.4s, v12.4h, v0.h[1] 221 umlal2 v15.4s, v12.8h, v0.h[1] 222 umlal v14.4s, v13.4h, v0.h[1] 223 umlal2 v15.4s, v13.8h, v0.h[1] 224 225 uqrshrn v14.4h, v14.4s, #16 226 uqrshrn2 v14.8h, v15.4s, #16 227 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 228 229 mov v8.16b, v9.16b 230 mov v9.16b, v10.16b 231 mov v10.16b, v11.16b 232.endm/*}}}*/ 233 234.macro hconv1_16/*{{{*/ 235 umull v14.4s, v8.4h, v0.h[0] 236 umull2 v15.4s, v8.8h, v0.h[0] 237 238 adr x16, 100f 239 ldrsh x12, [x16, x5, LSL #1] 240 add x12, x12, x16 241 br x12 242 100: .hword -4 243 .hword 101f-100b 244 .hword 102f-100b 245 .hword 103f-100b 246 .hword 104f-100b 247 .hword 105f-100b 248 .hword 106f-100b 249 .hword 107f-100b 250 .hword 108f-100b 251 .hword 109f-100b 252 .hword 110f-100b 253 .hword 111f-100b 254 .hword 112f-100b 255 .hword 113f-100b 256 .hword 114f-100b 257 .hword 115f-100b 258 .hword 116f-100b 259 .align 4 260 116: //ext v12.16b, v6.16b, v7.16b, #0*2 261 //ext v13.16b, v10.16b, v11.16b, #0*2 262 umlal v14.4s, v6.4h, v2.h[0] 263 umlal2 v15.4s, v6.8h, v2.h[0] 264 umlal v14.4s, v10.4h, v2.h[0] 265 umlal2 v15.4s, v10.8h, v2.h[0] 266 115: ext v12.16b, v6.16b, v7.16b, #1*2 267 ext v13.16b, v9.16b, v10.16b, #7*2 268 umlal v14.4s, v12.4h, v1.h[7] 269 umlal2 v15.4s, v12.8h, v1.h[7] 270 umlal v14.4s, v13.4h, v1.h[7] 271 umlal2 v15.4s, v13.8h, v1.h[7] 272 114: ext v12.16b, v6.16b, v7.16b, #2*2 273 ext v13.16b, v9.16b, v10.16b, #6*2 274 umlal v14.4s, v12.4h, v1.h[6] 275 umlal2 v15.4s, v12.8h, v1.h[6] 276 umlal v14.4s, v13.4h, v1.h[6] 277 umlal2 v15.4s, v13.8h, v1.h[6] 278 113: ext v12.16b, v6.16b, v7.16b, #3*2 279 ext v13.16b, v9.16b, v10.16b, #5*2 280 umlal v14.4s, v12.4h, v1.h[5] 281 umlal2 v15.4s, v12.8h, v1.h[5] 282 umlal v14.4s, v13.4h, v1.h[5] 283 umlal2 v15.4s, v13.8h, v1.h[5] 284 112: //ext v12.16b, v6.16b, v7.16b, #4*2 285 //ext v13.16b, v9.16b, v10.16b, #4*2 286 umlal2 v14.4s, v6.8h, v1.h[4] 287 umlal v15.4s, v7.4h, v1.h[4] 288 umlal2 v14.4s, v9.8h, v1.h[4] 289 umlal v15.4s, v10.4h, v1.h[4] 290 
111: ext v12.16b, v6.16b, v7.16b, #5*2 291 ext v13.16b, v9.16b, v10.16b, #3*2 292 umlal v14.4s, v12.4h, v1.h[3] 293 umlal2 v15.4s, v12.8h, v1.h[3] 294 umlal v14.4s, v13.4h, v1.h[3] 295 umlal2 v15.4s, v13.8h, v1.h[3] 296 110: ext v12.16b, v6.16b, v7.16b, #6*2 297 ext v13.16b, v9.16b, v10.16b, #2*2 298 umlal v14.4s, v12.4h, v1.h[2] 299 umlal2 v15.4s, v12.8h, v1.h[2] 300 umlal v14.4s, v13.4h, v1.h[2] 301 umlal2 v15.4s, v13.8h, v1.h[2] 302 109: ext v12.16b, v6.16b, v7.16b, #7*2 303 ext v13.16b, v9.16b, v10.16b, #1*2 304 umlal v14.4s, v12.4h, v1.h[1] 305 umlal2 v15.4s, v12.8h, v1.h[1] 306 umlal v14.4s, v13.4h, v1.h[1] 307 umlal2 v15.4s, v13.8h, v1.h[1] 308 108: //ext v12.16b, v7.16b, v8.16b, #0*2 309 //ext v13.16b, v9.16b, v10.16b, #0*2 310 umlal v14.4s, v7.4h, v1.h[0] 311 umlal2 v15.4s, v7.8h, v1.h[0] 312 umlal v14.4s, v9.4h, v1.h[0] 313 umlal2 v15.4s, v9.8h, v1.h[0] 314 107: ext v12.16b, v7.16b, v8.16b, #1*2 315 ext v13.16b, v8.16b, v9.16b, #7*2 316 umlal v14.4s, v12.4h, v0.h[7] 317 umlal2 v15.4s, v12.8h, v0.h[7] 318 umlal v14.4s, v13.4h, v0.h[7] 319 umlal2 v15.4s, v13.8h, v0.h[7] 320 106: ext v12.16b, v7.16b, v8.16b, #2*2 321 ext v13.16b, v8.16b, v9.16b, #6*2 322 umlal v14.4s, v12.4h, v0.h[6] 323 umlal2 v15.4s, v12.8h, v0.h[6] 324 umlal v14.4s, v13.4h, v0.h[6] 325 umlal2 v15.4s, v13.8h, v0.h[6] 326 105: ext v12.16b, v7.16b, v8.16b, #3*2 327 ext v13.16b, v8.16b, v9.16b, #5*2 328 umlal v14.4s, v12.4h, v0.h[5] 329 umlal2 v15.4s, v12.8h, v0.h[5] 330 umlal v14.4s, v13.4h, v0.h[5] 331 umlal2 v15.4s, v13.8h, v0.h[5] 332 104: //ext v12.16b, v7.16b, v8.16b, #4*2 333 //ext v13.16b, v8.16b, v9.16b, #4*2 334 umlal2 v14.4s, v7.8h, v0.h[4] 335 umlal v15.4s, v8.4h, v0.h[4] 336 umlal2 v14.4s, v8.8h, v0.h[4] 337 umlal v15.4s, v9.4h, v0.h[4] 338 103: ext v12.16b, v7.16b, v8.16b, #5*2 339 ext v13.16b, v8.16b, v9.16b, #3*2 340 umlal v14.4s, v12.4h, v0.h[3] 341 umlal2 v15.4s, v12.8h, v0.h[3] 342 umlal v14.4s, v13.4h, v0.h[3] 343 umlal2 v15.4s, v13.8h, v0.h[3] 344 102: ext v12.16b, 
v7.16b, v8.16b, #6*2 345 ext v13.16b, v8.16b, v9.16b, #2*2 346 umlal v14.4s, v12.4h, v0.h[2] 347 umlal2 v15.4s, v12.8h, v0.h[2] 348 umlal v14.4s, v13.4h, v0.h[2] 349 umlal2 v15.4s, v13.8h, v0.h[2] 350 101: ext v12.16b, v7.16b, v8.16b, #7*2 351 ext v13.16b, v8.16b, v9.16b, #1*2 352 umlal v14.4s, v12.4h, v0.h[1] 353 umlal2 v15.4s, v12.8h, v0.h[1] 354 umlal v14.4s, v13.4h, v0.h[1] 355 umlal2 v15.4s, v13.8h, v0.h[1] 356 357 uqrshrn v14.4h, v14.4s, #16 358 uqrshrn2 v14.8h, v15.4s, #16 359 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 360 361 mov v6.16b, v7.16b 362 mov v7.16b, v8.16b 363 mov v8.16b, v9.16b 364 mov v9.16b, v10.16b 365 mov v10.16b, v11.16b 366.endm/*}}}*/ 367 368.macro hconv1_25/*{{{*/ 369 ext v12.16b, v6.16b, v7.16b, #7*2 370 umull v14.4s, v12.4h, v0.h[0] 371 umull2 v15.4s, v12.8h, v0.h[0] 372 373 adr x16, 100f 374 ldrsh x12, [x16, x5, LSL #1] 375 add x12, x12, x16 376 br x12 377 100: .hword -4 378 .hword 101f-100b 379 .hword 102f-100b 380 .hword 103f-100b 381 .hword 104f-100b 382 .hword 105f-100b 383 .hword 106f-100b 384 .hword 107f-100b 385 .hword 108f-100b 386 .hword 109f-100b 387 .hword 110f-100b 388 .hword 111f-100b 389 .hword 112f-100b 390 .hword 113f-100b 391 .hword 114f-100b 392 .hword 115f-100b 393 .hword 116f-100b 394 .hword 117f-100b 395 .hword 118f-100b 396 .hword 119f-100b 397 .hword 120f-100b 398 .hword 121f-100b 399 .hword 122f-100b 400 .hword 123f-100b 401 .hword 124f-100b 402 .hword 125f-100b 403 .align 4 404 125: ext v12.16b, v3.16b, v4.16b, #6*2 405 ext v13.16b, v10.16b, v11.16b, #0*2 406 umlal v14.4s, v12.4h, v3.h[1] 407 umlal2 v15.4s, v12.8h, v3.h[1] 408 umlal v14.4s, v13.4h, v3.h[1] 409 umlal2 v15.4s, v13.8h, v3.h[1] 410 124: ext v12.16b, v3.16b, v4.16b, #7*2 411 ext v13.16b, v9.16b, v10.16b, #7*2 412 umlal v14.4s, v12.4h, v3.h[0] 413 umlal2 v15.4s, v12.8h, v3.h[0] 414 umlal v14.4s, v13.4h, v3.h[0] 415 umlal2 v15.4s, v13.8h, v3.h[0] 416 123: ext v12.16b, v4.16b, v5.16b, #0*2 417 ext v13.16b, v9.16b, v10.16b, #6*2 418 umlal v14.4s, v12.4h, 
v2.h[7] 419 umlal2 v15.4s, v12.8h, v2.h[7] 420 umlal v14.4s, v13.4h, v2.h[7] 421 umlal2 v15.4s, v13.8h, v2.h[7] 422 122: ext v12.16b, v4.16b, v5.16b, #1*2 423 ext v13.16b, v9.16b, v10.16b, #5*2 424 umlal v14.4s, v12.4h, v2.h[6] 425 umlal2 v15.4s, v12.8h, v2.h[6] 426 umlal v14.4s, v13.4h, v2.h[6] 427 umlal2 v15.4s, v13.8h, v2.h[6] 428 121: ext v12.16b, v4.16b, v5.16b, #2*2 429 ext v13.16b, v9.16b, v10.16b, #4*2 430 umlal v14.4s, v12.4h, v2.h[5] 431 umlal2 v15.4s, v12.8h, v2.h[5] 432 umlal v14.4s, v13.4h, v2.h[5] 433 umlal2 v15.4s, v13.8h, v2.h[5] 434 120: ext v12.16b, v4.16b, v5.16b, #3*2 435 ext v13.16b, v9.16b, v10.16b, #3*2 436 umlal v14.4s, v12.4h, v2.h[4] 437 umlal2 v15.4s, v12.8h, v2.h[4] 438 umlal v14.4s, v13.4h, v2.h[4] 439 umlal2 v15.4s, v13.8h, v2.h[4] 440 119: ext v12.16b, v4.16b, v5.16b, #4*2 441 ext v13.16b, v9.16b, v10.16b, #2*2 442 umlal v14.4s, v12.4h, v2.h[3] 443 umlal2 v15.4s, v12.8h, v2.h[3] 444 umlal v14.4s, v13.4h, v2.h[3] 445 umlal2 v15.4s, v13.8h, v2.h[3] 446 118: ext v12.16b, v4.16b, v5.16b, #5*2 447 ext v13.16b, v9.16b, v10.16b, #1*2 448 umlal v14.4s, v12.4h, v2.h[2] 449 umlal2 v15.4s, v12.8h, v2.h[2] 450 umlal v14.4s, v13.4h, v2.h[2] 451 umlal2 v15.4s, v13.8h, v2.h[2] 452 117: ext v12.16b, v4.16b, v5.16b, #6*2 453 ext v13.16b, v9.16b, v10.16b, #0*2 454 umlal v14.4s, v12.4h, v2.h[1] 455 umlal2 v15.4s, v12.8h, v2.h[1] 456 umlal v14.4s, v13.4h, v2.h[1] 457 umlal2 v15.4s, v13.8h, v2.h[1] 458 116: ext v12.16b, v4.16b, v5.16b, #7*2 459 ext v13.16b, v8.16b, v9.16b, #7*2 460 umlal v14.4s, v12.4h, v2.h[0] 461 umlal2 v15.4s, v12.8h, v2.h[0] 462 umlal v14.4s, v13.4h, v2.h[0] 463 umlal2 v15.4s, v13.8h, v2.h[0] 464 115: ext v12.16b, v5.16b, v6.16b, #0*2 465 ext v13.16b, v8.16b, v9.16b, #6*2 466 umlal v14.4s, v12.4h, v1.h[7] 467 umlal2 v15.4s, v12.8h, v1.h[7] 468 umlal v14.4s, v13.4h, v1.h[7] 469 umlal2 v15.4s, v13.8h, v1.h[7] 470 114: ext v12.16b, v5.16b, v6.16b, #1*2 471 ext v13.16b, v8.16b, v9.16b, #5*2 472 umlal v14.4s, v12.4h, v1.h[6] 473 umlal2 
v15.4s, v12.8h, v1.h[6] 474 umlal v14.4s, v13.4h, v1.h[6] 475 umlal2 v15.4s, v13.8h, v1.h[6] 476 113: ext v12.16b, v5.16b, v6.16b, #2*2 477 ext v13.16b, v8.16b, v9.16b, #4*2 478 umlal v14.4s, v12.4h, v1.h[5] 479 umlal2 v15.4s, v12.8h, v1.h[5] 480 umlal v14.4s, v13.4h, v1.h[5] 481 umlal2 v15.4s, v13.8h, v1.h[5] 482 112: ext v12.16b, v5.16b, v6.16b, #3*2 483 ext v13.16b, v8.16b, v9.16b, #3*2 484 umlal v14.4s, v12.4h, v1.h[4] 485 umlal2 v15.4s, v12.8h, v1.h[4] 486 umlal v14.4s, v13.4h, v1.h[4] 487 umlal2 v15.4s, v13.8h, v1.h[4] 488 111: ext v12.16b, v5.16b, v6.16b, #4*2 489 ext v13.16b, v8.16b, v9.16b, #2*2 490 umlal v14.4s, v12.4h, v1.h[3] 491 umlal2 v15.4s, v12.8h, v1.h[3] 492 umlal v14.4s, v13.4h, v1.h[3] 493 umlal2 v15.4s, v13.8h, v1.h[3] 494 110: ext v12.16b, v5.16b, v6.16b, #5*2 495 ext v13.16b, v8.16b, v9.16b, #1*2 496 umlal v14.4s, v12.4h, v1.h[2] 497 umlal2 v15.4s, v12.8h, v1.h[2] 498 umlal v14.4s, v13.4h, v1.h[2] 499 umlal2 v15.4s, v13.8h, v1.h[2] 500 109: ext v12.16b, v5.16b, v6.16b, #6*2 501 ext v13.16b, v8.16b, v9.16b, #0*2 502 umlal v14.4s, v12.4h, v1.h[1] 503 umlal2 v15.4s, v12.8h, v1.h[1] 504 umlal v14.4s, v13.4h, v1.h[1] 505 umlal2 v15.4s, v13.8h, v1.h[1] 506 108: ext v12.16b, v5.16b, v6.16b, #7*2 507 ext v13.16b, v7.16b, v8.16b, #7*2 508 umlal v14.4s, v12.4h, v1.h[0] 509 umlal2 v15.4s, v12.8h, v1.h[0] 510 umlal v14.4s, v13.4h, v1.h[0] 511 umlal2 v15.4s, v13.8h, v1.h[0] 512 107: ext v12.16b, v6.16b, v7.16b, #0*2 513 ext v13.16b, v7.16b, v8.16b, #6*2 514 umlal v14.4s, v12.4h, v0.h[7] 515 umlal2 v15.4s, v12.8h, v0.h[7] 516 umlal v14.4s, v13.4h, v0.h[7] 517 umlal2 v15.4s, v13.8h, v0.h[7] 518 106: ext v12.16b, v6.16b, v7.16b, #1*2 519 ext v13.16b, v7.16b, v8.16b, #5*2 520 umlal v14.4s, v12.4h, v0.h[6] 521 umlal2 v15.4s, v12.8h, v0.h[6] 522 umlal v14.4s, v13.4h, v0.h[6] 523 umlal2 v15.4s, v13.8h, v0.h[6] 524 105: ext v12.16b, v6.16b, v7.16b, #2*2 525 ext v13.16b, v7.16b, v8.16b, #4*2 526 umlal v14.4s, v12.4h, v0.h[5] 527 umlal2 v15.4s, v12.8h, v0.h[5] 528 
umlal v14.4s, v13.4h, v0.h[5] 529 umlal2 v15.4s, v13.8h, v0.h[5] 530 104: ext v12.16b, v6.16b, v7.16b, #3*2 531 ext v13.16b, v7.16b, v8.16b, #3*2 532 umlal v14.4s, v12.4h, v0.h[4] 533 umlal2 v15.4s, v12.8h, v0.h[4] 534 umlal v14.4s, v13.4h, v0.h[4] 535 umlal2 v15.4s, v13.8h, v0.h[4] 536 103: ext v12.16b, v6.16b, v7.16b, #4*2 537 ext v13.16b, v7.16b, v8.16b, #2*2 538 umlal v14.4s, v12.4h, v0.h[3] 539 umlal2 v15.4s, v12.8h, v0.h[3] 540 umlal v14.4s, v13.4h, v0.h[3] 541 umlal2 v15.4s, v13.8h, v0.h[3] 542 102: ext v12.16b, v6.16b, v7.16b, #5*2 543 ext v13.16b, v7.16b, v8.16b, #1*2 544 umlal v14.4s, v12.4h, v0.h[2] 545 umlal2 v15.4s, v12.8h, v0.h[2] 546 umlal v14.4s, v13.4h, v0.h[2] 547 umlal2 v15.4s, v13.8h, v0.h[2] 548 101: ext v12.16b, v6.16b, v7.16b, #6*2 549 ext v13.16b, v7.16b, v8.16b, #0*2 550 umlal v14.4s, v12.4h, v0.h[1] 551 umlal2 v15.4s, v12.8h, v0.h[1] 552 umlal v14.4s, v13.4h, v0.h[1] 553 umlal2 v15.4s, v13.8h, v0.h[1] 554 555 uqrshrn v14.4h, v14.4s, #16 556 uqrshrn2 v14.8h, v15.4s, #16 557 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 558 559 ins v3.d[1], v4.d[0] 560 mov v4.16b, v5.16b 561 mov v5.16b, v6.16b 562 mov v6.16b, v7.16b 563 mov v7.16b, v8.16b 564 mov v8.16b, v9.16b 565 mov v9.16b, v10.16b 566 mov v10.16b, v11.16b 567.endm/*}}}*/ 568 569#define TUNED_LIST4 6, 12 570.macro hconv4_6/*{{{*/ 571 umull v14.4s, v7.4h, v0.h[0] 572 umull2 v15.4s, v7.8h, v0.h[0] 573 574 adr x16, 100f 575 ldrsh x12, [x16, x5, LSL #1] 576 add x12, x12, x16 577 br x12 578 100: .hword -4 579 .hword 101f-100b 580 .hword 102f-100b 581 .hword 103f-100b 582 .hword 104f-100b 583 .hword 105f-100b 584 .hword 106f-100b 585 .align 4 586 106: umlal v14.4s, v4.4h, v0.h[6] 587 umlal2 v15.4s, v4.8h, v0.h[6] 588 umlal v14.4s, v10.4h, v0.h[6] 589 umlal2 v15.4s, v10.8h, v0.h[6] 590 105: umlal2 v14.4s, v4.8h, v0.h[5] 591 umlal v15.4s, v5.4h, v0.h[5] 592 umlal2 v14.4s, v9.8h, v0.h[5] 593 umlal v15.4s, v10.4h, v0.h[5] 594 104: umlal v14.4s, v5.4h, v0.h[4] 595 umlal2 v15.4s, v5.8h, v0.h[4] 596 umlal 
v14.4s, v9.4h, v0.h[4] 597 umlal2 v15.4s, v9.8h, v0.h[4] 598 103: umlal2 v14.4s, v5.8h, v0.h[3] 599 umlal v15.4s, v6.4h, v0.h[3] 600 umlal2 v14.4s, v8.8h, v0.h[3] 601 umlal v15.4s, v9.4h, v0.h[3] 602 102: umlal v14.4s, v6.4h, v0.h[2] 603 umlal2 v15.4s, v6.8h, v0.h[2] 604 umlal v14.4s, v8.4h, v0.h[2] 605 umlal2 v15.4s, v8.8h, v0.h[2] 606 101: umlal2 v14.4s, v6.8h, v0.h[1] 607 umlal v15.4s, v7.4h, v0.h[1] 608 umlal2 v14.4s, v7.8h, v0.h[1] 609 umlal v15.4s, v8.4h, v0.h[1] 610 611 uqrshrn v14.4h, v14.4s, #16 612 uqrshrn2 v14.8h, v15.4s, #16 613 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 614 615 mov v4.16b, v5.16b 616 mov v5.16b, v6.16b 617 mov v6.16b, v7.16b 618 mov v7.16b, v8.16b 619 mov v8.16b, v9.16b 620 mov v9.16b, v10.16b 621 mov v10.16b, v11.16b 622.endm/*}}}*/ 623 624.macro hconv4_12/*{{{*/ 625 umull v14.4s, v4.4h, v0.h[0] 626 umull2 v15.4s, v4.8h, v0.h[0] 627 628 adr x16, 100f 629 ldrsh x12, [x16, x5, LSL #1] 630 add x12, x12, x16 631 br x12 632 100: .hword -4 633 .hword 101f-100b 634 .hword 102f-100b 635 .hword 103f-100b 636 .hword 104f-100b 637 .hword 105f-100b 638 .hword 106f-100b 639 .hword 107f-100b 640 .hword 108f-100b 641 .hword 109f-100b 642 .hword 110f-100b 643 .hword 111f-100b 644 .hword 112f-100b 645 .align 4 646 112: add x12, x9, #0x1a0 647 bic x12, x12, #0x200 648 ld1 {v12.8h}, [x12] 649 umlal v14.4s, v12.4h, v1.h[4] 650 umlal2 v15.4s, v12.8h, v1.h[4] 651 umlal v14.4s, v10.4h, v1.h[4] 652 umlal2 v15.4s, v10.8h, v1.h[4] 653 111: add x12, x9, #0x1a8 654 bic x12, x12, #0x200 655 ld1 {v12.4h}, [x12], #8 656 bic x12, x12, #0x200 657 ld1 {v13.4h}, [x12] 658 umlal v14.4s, v12.4h, v1.h[3] 659 umlal v15.4s, v13.4h, v1.h[3] 660 umlal2 v14.4s, v9.8h, v1.h[3] 661 umlal v15.4s, v10.4h, v1.h[3] 662 110: add x12, x9, #0x1b0 663 bic x12, x12, #0x200 664 ld1 {v12.8h}, [x12] 665 umlal v14.4s, v12.4h, v1.h[2] 666 umlal2 v15.4s, v12.8h, v1.h[2] 667 umlal v14.4s, v9.4h, v1.h[2] 668 umlal2 v15.4s, v9.8h, v1.h[2] 669 109: add x12, x9, #0x1b8 670 bic x12, x12, #0x200 671 ld1 
{v12.4h}, [x12], #8 672 bic x12, x12, #0x200 673 ld1 {v13.4h}, [x12] 674 umlal v14.4s, v12.4h, v1.h[1] 675 umlal v15.4s, v13.4h, v1.h[1] 676 umlal2 v14.4s, v8.8h, v1.h[1] 677 umlal v15.4s, v9.4h, v1.h[1] 678 108: add x12, x9, #0x1c0 679 bic x12, x12, #0x200 680 ld1 {v12.8h}, [x12] 681 umlal v14.4s, v12.4h, v1.h[0] 682 umlal2 v15.4s, v12.8h, v1.h[0] 683 umlal v14.4s, v8.4h, v1.h[0] 684 umlal2 v15.4s, v8.8h, v1.h[0] 685 107: add x12, x9, #0x1c8 686 bic x12, x12, #0x200 687 ld1 {v12.4h}, [x12], #8 688 bic x12, x12, #0x200 689 ld1 {v13.4h}, [x12] 690 umlal v14.4s, v12.4h, v0.h[7] 691 umlal v15.4s, v13.4h, v0.h[7] 692 umlal2 v14.4s, v7.8h, v0.h[7] 693 umlal v15.4s, v8.4h, v0.h[7] 694 106: add x12, x9, #0x1d0 695 bic x12, x12, #0x200 696 ld1 {v12.8h}, [x12] 697 umlal v14.4s, v12.4h, v0.h[6] 698 umlal2 v15.4s, v12.8h, v0.h[6] 699 umlal v14.4s, v7.4h, v0.h[6] 700 umlal2 v15.4s, v7.8h, v0.h[6] 701 105: add x12, x9, #0x1d8 702 bic x12, x12, #0x200 703 ld1 {v12.4h}, [x12], #8 704 bic x12, x12, #0x200 705 ld1 {v13.4h}, [x12] 706 umlal v14.4s, v12.4h, v0.h[5] 707 umlal v15.4s, v13.4h, v0.h[5] 708 umlal2 v14.4s, v6.8h, v0.h[5] 709 umlal v15.4s, v7.4h, v0.h[5] 710 104: add x12, x9, #0x1e0 711 bic x12, x12, #0x200 712 ld1 {v12.8h}, [x12] 713 umlal v14.4s, v12.4h, v0.h[4] 714 umlal2 v15.4s, v12.8h, v0.h[4] 715 umlal v14.4s, v6.4h, v0.h[4] 716 umlal2 v15.4s, v6.8h, v0.h[4] 717 103: add x12, x9, #0x1e8 718 bic x12, x12, #0x200 719 ld1 {v12.4h}, [x12], #8 720 bic x12, x12, #0x200 721 ld1 {v13.4h}, [x12] 722 umlal v14.4s, v12.4h, v0.h[3] 723 umlal v15.4s, v13.4h, v0.h[3] 724 umlal2 v14.4s, v5.8h, v0.h[3] 725 umlal v15.4s, v6.4h, v0.h[3] 726 102: add x12, x9, #0x1f0 727 bic x12, x12, #0x200 728 ld1 {v12.8h}, [x12] 729 umlal v14.4s, v12.4h, v0.h[2] 730 umlal2 v15.4s, v12.8h, v0.h[2] 731 umlal v14.4s, v5.4h, v0.h[2] 732 umlal2 v15.4s, v5.8h, v0.h[2] 733 101: add x12, x9, #0x1f8 734 bic x12, x12, #0x200 735 ld1 {v12.4h}, [x12] 736 umlal v14.4s, v12.4h, v0.h[1] 737 umlal v15.4s, v4.4h, 
v0.h[1] 738 umlal2 v14.4s, v4.8h, v0.h[1] 739 umlal v15.4s, v5.4h, v0.h[1] 740 741 uqrshrn v14.4h, v14.4s, #16 742 uqrshrn2 v14.8h, v15.4s, #16 743 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 744 745 st1 {v4.16b}, [x9], #16 746 bic x9, x9, #0x200 747 mov v4.16b, v5.16b 748 mov v5.16b, v6.16b 749 mov v6.16b, v7.16b 750 mov v7.16b, v8.16b 751 mov v8.16b, v9.16b 752 mov v9.16b, v10.16b 753 mov v10.16b, v11.16b 754.endm/*}}}*/ 755 756.macro hconv4_25/*{{{*/ 757 add x12, x9, #0x198 758 bic x12, x12, #0x200 759 ld1 {v12.4h}, [x12], #8 760 bic x12, x12, #0x200 761 ld1 {v13.4h}, [x12] 762 umull v14.4s, v12.4h, v0.h[0] 763 umull v15.4s, v13.4h, v0.h[0] 764 765 adr x16, 100f 766 ldrsh x12, [x16, x5, LSL #1] 767 add x12, x12, x16 768 br x12 769 100: .hword -4 770 .hword 101f-100b 771 .hword 102f-100b 772 .hword 103f-100b 773 .hword 104f-100b 774 .hword 105f-100b 775 .hword 106f-100b 776 .hword 107f-100b 777 .hword 108f-100b 778 .hword 109f-100b 779 .hword 110f-100b 780 .hword 111f-100b 781 .hword 112f-100b 782 .hword 113f-100b 783 .hword 114f-100b 784 .hword 115f-100b 785 .hword 116f-100b 786 .hword 117f-100b 787 .hword 118f-100b 788 .hword 119f-100b 789 .hword 120f-100b 790 .hword 121f-100b 791 .hword 122f-100b 792 .hword 123f-100b 793 .hword 124f-100b 794 .hword 125f-100b 795 .align 4 796 125: add x12, x9, #0x0d0 797 bic x12, x12, #0x200 798 ld1 {v12.8h}, [x12] 799 umlal v14.4s, v12.4h, v3.h[1] 800 umlal2 v15.4s, v12.8h, v3.h[1] 801 umlal v14.4s, v10.4h, v3.h[1] 802 umlal2 v15.4s, v10.8h, v3.h[1] 803 124: add x12, x9, #0x0d8 804 bic x12, x12, #0x200 805 ld1 {v12.4h}, [x12], #8 806 bic x12, x12, #0x200 807 ld1 {v13.4h}, [x12] 808 umlal v14.4s, v12.4h, v3.h[0] 809 umlal v15.4s, v13.4h, v3.h[0] 810 umlal2 v14.4s, v9.8h, v3.h[0] 811 umlal v15.4s, v10.4h, v3.h[0] 812 123: add x12, x9, #0x0e0 813 bic x12, x12, #0x200 814 ld1 {v12.8h}, [x12] 815 umlal v14.4s, v12.4h, v2.h[7] 816 umlal2 v15.4s, v12.8h, v2.h[7] 817 umlal v14.4s, v9.4h, v2.h[7] 818 umlal2 v15.4s, v9.8h, v2.h[7] 819 122: 
add x12, x9, #0x0e8 820 bic x12, x12, #0x200 821 ld1 {v12.4h}, [x12], #8 822 bic x12, x12, #0x200 823 ld1 {v13.4h}, [x12] 824 umlal v14.4s, v12.4h, v2.h[6] 825 umlal v15.4s, v13.4h, v2.h[6] 826 umlal2 v14.4s, v8.8h, v2.h[6] 827 umlal v15.4s, v9.4h, v2.h[6] 828 121: add x12, x9, #0x0f0 829 bic x12, x12, #0x200 830 ld1 {v12.8h}, [x12] 831 umlal v14.4s, v12.4h, v2.h[5] 832 umlal2 v15.4s, v12.8h, v2.h[5] 833 umlal v14.4s, v8.4h, v2.h[5] 834 umlal2 v15.4s, v8.8h, v2.h[5] 835 120: add x12, x9, #0x0f8 836 bic x12, x12, #0x200 837 ld1 {v12.4h}, [x12], #8 838 bic x12, x12, #0x200 839 ld1 {v13.4h}, [x12] 840 umlal v14.4s, v12.4h, v2.h[4] 841 umlal v15.4s, v13.4h, v2.h[4] 842 umlal2 v14.4s, v7.8h, v2.h[4] 843 umlal v15.4s, v8.4h, v2.h[4] 844 119: add x12, x9, #0x100 845 bic x12, x12, #0x200 846 ld1 {v12.8h}, [x12] 847 umlal v14.4s, v12.4h, v2.h[3] 848 umlal2 v15.4s, v12.8h, v2.h[3] 849 umlal v14.4s, v7.4h, v2.h[3] 850 umlal2 v15.4s, v7.8h, v2.h[3] 851 118: add x12, x9, #0x108 852 bic x12, x12, #0x200 853 ld1 {v12.4h}, [x12], #8 854 bic x12, x12, #0x200 855 ld1 {v13.4h}, [x12] 856 umlal v14.4s, v12.4h, v2.h[2] 857 umlal v15.4s, v13.4h, v2.h[2] 858 umlal2 v14.4s, v6.8h, v2.h[2] 859 umlal v15.4s, v7.4h, v2.h[2] 860 117: add x12, x9, #0x110 861 bic x12, x12, #0x200 862 ld1 {v12.8h}, [x12] 863 umlal v14.4s, v12.4h, v2.h[1] 864 umlal2 v15.4s, v12.8h, v2.h[1] 865 umlal v14.4s, v6.4h, v2.h[1] 866 umlal2 v15.4s, v6.8h, v2.h[1] 867 116: add x12, x9, #0x118 868 bic x12, x12, #0x200 869 ld1 {v12.4h}, [x12], #8 870 bic x12, x12, #0x200 871 ld1 {v13.4h}, [x12] 872 umlal v14.4s, v12.4h, v2.h[0] 873 umlal v15.4s, v13.4h, v2.h[0] 874 umlal2 v14.4s, v5.8h, v2.h[0] 875 umlal v15.4s, v6.4h, v2.h[0] 876 115: add x12, x9, #0x120 877 bic x12, x12, #0x200 878 ld1 {v12.8h}, [x12] 879 umlal v14.4s, v12.4h, v1.h[7] 880 umlal2 v15.4s, v12.8h, v1.h[7] 881 umlal v14.4s, v5.4h, v1.h[7] 882 umlal2 v15.4s, v5.8h, v1.h[7] 883 114: add x12, x9, #0x128 884 bic x12, x12, #0x200 885 ld1 {v12.4h}, [x12], #8 886 
bic x12, x12, #0x200 887 ld1 {v13.4h}, [x12] 888 umlal v14.4s, v12.4h, v1.h[6] 889 umlal v15.4s, v13.4h, v1.h[6] 890 umlal2 v14.4s, v4.8h, v1.h[6] 891 umlal v15.4s, v5.4h, v1.h[6] 892 113: add x12, x9, #0x130 893 bic x12, x12, #0x200 894 ld1 {v12.8h}, [x12] 895 umlal v14.4s, v12.4h, v1.h[5] 896 umlal2 v15.4s, v12.8h, v1.h[5] 897 umlal v14.4s, v4.4h, v1.h[5] 898 umlal2 v15.4s, v4.8h, v1.h[5] 899 112: add x12, x9, #0x138 900 bic x12, x12, #0x200 901 ld1 {v12.4h}, [x12], #8 902 bic x12, x12, #0x200 903 ld1 {v16.4h}, [x12] 904 add x12, x9, #0x1f8 905 bic x12, x12, #0x200 906 ld1 {v13.4h}, [x12] 907 umlal v14.4s, v12.4h, v1.h[4] 908 umlal v15.4s, v16.4h, v1.h[4] 909 umlal v14.4s, v13.4h, v1.h[4] // Could be d7, without the load, right? 910 umlal v15.4s, v4.4h, v1.h[4] 911 111: add x12, x9, #0x140 912 bic x12, x12, #0x200 913 ld1 {v12.8h}, [x12] 914 add x12, x9, #0x1f0 915 bic x12, x12, #0x200 916 ld1 {v13.8h}, [x12] 917 umlal v14.4s, v12.4h, v1.h[3] 918 umlal2 v15.4s, v12.8h, v1.h[3] 919 umlal v14.4s, v13.4h, v1.h[3] 920 umlal2 v15.4s, v13.8h, v1.h[3] 921 110: add x12, x9, #0x148 922 bic x12, x12, #0x200 923 ld1 {v12.4h}, [x12], #8 924 bic x12, x12, #0x200 925 ld1 {v16.4h}, [x12] 926 add x12, x9, #0x1e8 927 bic x12, x12, #0x200 928 ld1 {v13.4h}, [x12], #8 929 bic x12, x12, #0x200 930 ld1 {v17.4h}, [x12] 931 umlal v14.4s, v12.4h, v1.h[2] 932 umlal v15.4s, v16.4h, v1.h[2] 933 umlal v14.4s, v13.4h, v1.h[2] 934 umlal v15.4s, v17.4h, v1.h[2] 935 109: add x12, x9, #0x150 936 bic x12, x12, #0x200 937 ld1 {v12.8h}, [x12] 938 add x12, x9, #0x1e0 939 bic x12, x12, #0x200 940 ld1 {v13.8h}, [x12] 941 umlal v14.4s, v12.4h, v1.h[1] 942 umlal2 v15.4s, v12.8h, v1.h[1] 943 umlal v14.4s, v13.4h, v1.h[1] 944 umlal2 v15.4s, v13.8h, v1.h[1] 945 108: add x12, x9, #0x158 946 bic x12, x12, #0x200 947 ld1 {v12.4h}, [x12], #8 948 bic x12, x12, #0x200 949 ld1 {v16.4h}, [x12] 950 add x12, x9, #0x1d8 951 bic x12, x12, #0x200 952 ld1 {v13.4h}, [x12], #8 953 bic x12, x12, #0x200 954 ld1 {v17.4h}, 
[x12] 955 umlal v14.4s, v12.4h, v1.h[0] 956 umlal v15.4s, v16.4h, v1.h[0] 957 umlal v14.4s, v13.4h, v1.h[0] 958 umlal v15.4s, v17.4h, v1.h[0] 959 107: add x12, x9, #0x160 960 bic x12, x12, #0x200 961 ld1 {v12.8h}, [x12] 962 add x12, x9, #0x1d0 963 bic x12, x12, #0x200 964 ld1 {v13.8h}, [x12] 965 umlal v14.4s, v12.4h, v0.h[7] 966 umlal2 v15.4s, v12.8h, v0.h[7] 967 umlal v14.4s, v13.4h, v0.h[7] 968 umlal2 v15.4s, v13.8h, v0.h[7] 969 106: add x12, x9, #0x168 970 bic x12, x12, #0x200 971 ld1 {v12.4h}, [x12], #8 972 bic x12, x12, #0x200 973 ld1 {v16.4h}, [x12] 974 add x12, x9, #0x1c8 975 bic x12, x12, #0x200 976 ld1 {v13.4h}, [x12], #8 977 bic x12, x12, #0x200 978 ld1 {v17.4h}, [x12] 979 umlal v14.4s, v12.4h, v0.h[6] 980 umlal v15.4s, v16.4h, v0.h[6] 981 umlal v14.4s, v13.4h, v0.h[6] 982 umlal v15.4s, v17.4h, v0.h[6] 983 105: add x12, x9, #0x170 984 bic x12, x12, #0x200 985 ld1 {v12.8h}, [x12] 986 add x12, x9, #0x1c0 987 bic x12, x12, #0x200 988 ld1 {v13.8h}, [x12] 989 umlal v14.4s, v12.4h, v0.h[5] 990 umlal2 v15.4s, v12.8h, v0.h[5] 991 umlal v14.4s, v13.4h, v0.h[5] 992 umlal2 v15.4s, v13.8h, v0.h[5] 993 104: add x12, x9, #0x178 994 bic x12, x12, #0x200 995 ld1 {v12.4h}, [x12], #8 996 bic x12, x12, #0x200 997 ld1 {v16.4h}, [x12] 998 add x12, x9, #0x1b8 999 bic x12, x12, #0x200 1000 ld1 {v13.4h}, [x12], #8 1001 bic x12, x12, #0x200 1002 ld1 {v17.4h}, [x12] 1003 umlal v14.4s, v12.4h, v0.h[4] 1004 umlal v15.4s, v16.4h, v0.h[4] 1005 umlal v14.4s, v13.4h, v0.h[4] 1006 umlal v15.4s, v17.4h, v0.h[4] 1007 103: add x12, x9, #0x180 1008 bic x12, x12, #0x200 1009 ld1 {v12.8h}, [x12] 1010 add x12, x9, #0x1b0 1011 bic x12, x12, #0x200 1012 ld1 {v13.8h}, [x12] 1013 umlal v14.4s, v12.4h, v0.h[3] 1014 umlal2 v15.4s, v12.8h, v0.h[3] 1015 umlal v14.4s, v13.4h, v0.h[3] 1016 umlal2 v15.4s, v13.8h, v0.h[3] 1017 102: add x12, x9, #0x188 1018 bic x12, x12, #0x200 1019 ld1 {v12.4h}, [x12], #8 1020 bic x12, x12, #0x200 1021 ld1 {v16.4h}, [x12] 1022 add x12, x9, #0x1a8 1023 bic x12, x12, #0x200 
1024 ld1 {v13.4h}, [x12], #8 1025 bic x12, x12, #0x200 1026 ld1 {v17.4h}, [x12] 1027 umlal v14.4s, v12.4h, v0.h[2] 1028 umlal v15.4s, v16.4h, v0.h[2] 1029 umlal v14.4s, v13.4h, v0.h[2] 1030 umlal v15.4s, v17.4h, v0.h[2] 1031 101: add x12, x9, #0x190 1032 bic x12, x12, #0x200 1033 ld1 {v12.8h}, [x12], #16 1034 bic x12, x12, #0x200 1035 ld1 {v13.8h}, [x12] 1036 umlal v14.4s, v12.4h, v0.h[1] 1037 umlal2 v15.4s, v12.8h, v0.h[1] 1038 umlal v14.4s, v13.4h, v0.h[1] 1039 umlal2 v15.4s, v13.8h, v0.h[1] 1040 1041 uqrshrn v14.4h, v14.4s, #16 1042 uqrshrn2 v14.8h, v15.4s, #16 1043 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 1044 1045 st1 {v4.16b}, [x9], #16 1046 bic x9, x9, #0x200 1047 mov v4.16b, v5.16b 1048 mov v5.16b, v6.16b 1049 mov v6.16b, v7.16b 1050 mov v7.16b, v8.16b 1051 mov v8.16b, v9.16b 1052 mov v9.16b, v10.16b 1053 mov v10.16b, v11.16b 1054.endm/*}}}*/ 1055 1056/* Dedicated function wrapper for the fetch macro, for the cases where 1057 * performance isn't that important, to keep code size down. 1058 */ 1059PRIVATE(fetch_generic_asm) 1060 stp x10, x11, [sp, #-16]! 1061 fetch 1062 ldp x10, x11, [sp], #16 1063 ret 1064END(fetch_generic_asm) 1065 1066/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value 1067 * across to fill the rest of the register pair. Used for filling the right 1068 * hand edge of the window when starting too close to the right hand edge of 1069 * the image. 
 */
PRIVATE(prefetch_clamp1)
        /* Work with the overhang as a positive count: x11 is negated on entry
         * and re-negated on exit, and x15/x19 are temporarily rebased relative
         * to x1 so the x1 adjustments below apply to all three pointers. */
        sub             x11, xzr, x11
        sub             x15, x15, x1
        sub             x19, x19, x1
        tbz             x11, #3, 1f
        mov             v11.16b, v10.16b        // 8+ columns overhang: only q10 holds real data
        sub             x1, x1, #16
1:      mov             v12.16b, v11.16b
        movi            v13.8b, #0xff           // start building a mask of the valid lanes
        tbz             x11, #2, 1f
        ext             v12.16b, v12.16b, v12.16b, #4*2
        sub             x1, x1, #8
        shl             v13.2d, v13.2d, #32
1:      tbz             x11, #1, 1f
        ext             v12.16b, v12.16b, v12.16b, #6*2
        sub             x1, x1, #4
        shl             v13.2d, v13.2d, #16
1:      tbz             x11, #0, 1f
        ext             v12.16b, v12.16b, v12.16b, #7*2
        sub             x1, x1, #2
        shl             v13.2d, v13.2d, #8
1:      dup             v12.8h, v12.h[6]        // replicate the edge column across all lanes
        sxtl            v13.8h, v13.8b          // widen the byte mask to the 16-bit lanes
        bif             v11.16b, v12.16b, v13.16b   // insert the replicated value where the mask is clear
1:      tbz             x11, #3, 1f
        mov             v10.16b, v11.16b
        mov             v11.16b, v12.16b
1:      sub             x11, xzr, x11           // restore x11 to its entry value
        add             x15, x15, x1            // re-derive absolute top/bottom row pointers
        add             x19, x19, x1
        ret
END(prefetch_clamp1)

PRIVATE(prefetch_clamp4)
        /* uchar4 variant of prefetch_clamp1: columns are 8 bytes wide, so
         * whole d-lane dup/mov operations suffice and no bit mask is needed. */
        sub             x11, xzr, x11
        sub             x15, x15, x1
        sub             x19, x19, x1
        tbz             x11, #3, 1f
        sub             x1, x1, #16             // back src up by a full vector when 8+ bytes overhang (mirrors clamp1) -- TODO(review): confirm
        mov             v11.16b, v10.16b
1:      dup             v12.2d, v11.d[1]
        tbz             x11, #2, 1f
        dup             v12.2d, v11.d[0]
        sub             x1, x1, #8
        dup             v11.2d, v11.d[0]
1:      tbz             x11, #3, 1f
        mov             v10.16b, v11.16b
        mov             v11.16b, v12.16b
1:      sub             x11, xzr, x11
        add             x15, x15, x1
        add             x19, x19, x1
        ret
END(prefetch_clamp4)


/* Helpers for prefetch, below.
 */
/* Dispose of one 16-byte pair according to \store: >0 flushes it to the
 * circular buffer at x9; ==0 keeps it in the named register pair; <0 keeps
 * only the high half (used for the uchar1 r=25 tail in q3). */
.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
  .if \store > 0
    .ifc \qsa,\qsb
        st1             {\qsa}, [x9], #16
        st1             {\qsb}, [x9], #16
    .else
        st1             {\qsa,\qsb}, [x9], #32
    .endif
  .elseif \store == 0
        mov             \qa, \qsa
        mov             \qb, \qsb
  .else
        ins             \qb, \qsb_hi
  .endif
.endm

/* Fill one 16-byte slice of the window: left-edge replicate (v9) before the
 * fill start (x10), freshly fetched data between x10 and x11, and right-edge
 * clamped data beyond x11.  The numeric labels deliberately chain between
 * successive expansions of this macro; the final expansion (rem == 0) supplies
 * the terminal forward labels.  NOTE(review): \c is accepted but unused here. */
.macro prefetch_one qa, qb, rem, c, store=0, step=1
.set i, (need - 16) - \rem
.if i >= 0
1:      cmp             x10, #i+16
        blo             2f
        prefetch_out    \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]   // slice lies before fill start: pad with left edge
        b               1f                      // resolves in the next expansion
2:      cmp             x11, #i+16
        bls             3f
        prefetch_out    \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
        bl              fetch_generic_asm       // pull in the next 16 columns
        b               2f
3:      bl              prefetch_clamp\step     // slice crosses fill stop: clamp to right edge
        prefetch_out    \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
4:      b               4f+4                    // skip into the pad path of the next expansion -- TODO(review): confirm offset trick
        //v12 contains pad word from prefetch_clamp call
        prefetch_out    \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
 .if \rem > 0
        b               4f+4
 .else
1:
2:
3:
4:      nop
 .endif
.endif
.endm

/* Fill the convolution window with context data.  The aim here is to load
 * exactly rlf + rrt columns, and in the main loop to read as many columns as
 * will be written.  This is complicated by the need to handle cases when the
 * input starts very close to the left or right (or both) edges of the image,
 * and where these do not fall on 16-byte boundaries.
1177 * 1178 * Input: 1179 * x1 -- src 1180 * x2 -- pitch 1181 * x3 -- count 1182 * x4 -- inlen 1183 * x5 -- r 1184 * x6 -- rup 1185 * x7 -- rdn 1186 * x8 -- rlf 1187 * x9 -- buffer (if needed) 1188 * x13 = -pitch 1189 * x15 = top-row in 1190 * x19 = bottom-row in 1191 * Output: 1192 * x1 += rlf + min(count, rrt) 1193 * Modifies: 1194 * x10 -- fill start index in the window 1195 * x11 -- fill stop index in the window 1196 * x12 -- scratch 1197 */ 1198.macro prefetch step=1, max_r=25 1199.set need, ((\max_r + \max_r) * \step + 15) & ~15 1200 .if \step == 1 1201 mov x10, #need - (\max_r * \step) 1202 sub x10, x10, x8 1203 .else 1204 mov x10, #need - (\max_r * \step) 1205 sub x10, x10, x8, LSL #2 1206 .endif 1207 add x11, x10, x4 1208 subs x11, x11, #need 1209 csel x11, xzr, x11, hi 1210 add x11, x11, #need 1211 1212 bl fetch_generic_asm 1213 .if \step == 1 1214 dup v9.8h, v10.h[0] 1215 .else 1216 dup v9.2d, v10.d[0] 1217 .endif 1218 tst x10, #15 1219 beq 2f 1220 sub x12, xzr, x10 1221 tbz x10, #3, 1f 1222 mov v11.16b, v10.16b 1223 mov v10.16b, v9.16b 12241: tbz x12, #2, 1f 1225 ext v11.16b, v10.16b, v11.16b, #4*2 1226 ext v10.16b, v9.16b, v10.16b, #4*2 1227 .if \step == 1 1228 1: tbz x12, #1, 1f 1229 ext v11.16b, v10.16b, v11.16b, #2*2 1230 ext v10.16b, v9.16b, v10.16b, #2*2 1231 1: tbz x12, #0, 1f 1232 ext v11.16b, v10.16b, v11.16b, #1*2 1233 ext v10.16b, v9.16b, v10.16b, #1*2 1234 .endif 12351: sub x1, x1, x10 1236 sub x15, x15, x10 1237 sub x19, x19, x10 1238 bic x10, x10, #15 1239 add x1, x1, x10 1240 add x15, x15, x10 1241 add x19, x19, x10 12422: 1243 .if \step > 1 1244 /* it's only in the uchar2 and uchar4 cases where the register file 1245 * is insufficient (given MAX_R <= 25). 
1246 */ 1247 prefetch_one xx, xx, 192, c=\max_r, step=\step, store=1 1248 prefetch_one xx, xx, 176, c=\max_r, step=\step, store=1 1249 prefetch_one xx, xx, 160, c=\max_r, step=\step, store=1 1250 prefetch_one xx, xx, 144, c=\max_r, step=\step, store=1 1251 prefetch_one xx, xx, 128, c=\max_r, step=\step, store=1 1252 prefetch_one xx, xx, 112, c=\max_r, step=\step, store=1 1253 prefetch_one xx, xx, 96, c=\max_r, step=\step, store=1 1254 prefetch_one xx, xx, 80, c=\max_r, step=\step, store=1 1255 prefetch_one xx, xx, 64, c=\max_r, step=\step, store=1 1256 prefetch_one xx, xx, 48, c=\max_r, step=\step, store=1 1257 .else 1258 /* q3 normally contains the coefficient table, but it's not fully 1259 * used. In the uchar1, r=25 case the other half of q3 is used for 1260 * the last two window taps to avoid falling out to memory. 1261 */ 1262 prefetch_one xx, v3.d[1], 48, c=\max_r, step=\step, store=-1 1263 .endif 1264 prefetch_one v4.16b, v5.16b, 32, c=\max_r, step=\step, store=0 1265 prefetch_one v6.16b, v7.16b, 16, c=\max_r, step=\step, store=0 1266 prefetch_one v8.16b, v9.16b, 0, c=\max_r, step=\step, store=0 1267 1268 .if \step == 1 1269 add x10, x8, #\max_r * \step 1270 .else 1271 lsl x10, x8, #2 1272 add x10, x10, #\max_r * \step 1273 .endif 1274 subs x4, x4, x10 1275 csel x4, xzr, x4, lo 1276.endm 1277 1278/* The main loop. 1279 * 1280 * Input: 1281 * x0 = dst 1282 * x1 = src 1283 * x2 = pitch 1284 * x3 = count 1285 * x4 = inlen 1286 * x5 = r 1287 * x6 = rup 1288 * x7 = rdn 1289 * x9 = buffer 1290 * x13 = -pitch 1291 * x15 = top-row in 1292 * x19 = bottom-row in 1293 * Modifies 1294 * x8 = fetch code pointer 1295 */ 1296.macro mainloop core, step=1, max_r=25, labelc="", labelnc="" 1297 adrp x8, \labelnc 1298 add x8, x8, #:lo12:\labelnc 1299 sub x8, x8, x5, LSL #5 1300 sub x8, x8, x5, LSL #3 1301 cmp x5, x6 1302 ccmp x5, x7, #0, eq 1303 beq 5f 1304 1305 /* if (r != rup || r != rdn) then the address-clamping table should 1306 * be used rather than the short-cut version. 
1307 */ 1308 adrp x8, \labelc 1309 add x8, x8, #:lo12:\labelc 1310 sub x8, x8, x5, LSL #6 1311 add x8, x8, x5, LSL #3 1312 b 5f 1313 .align 4 13143: fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8 1315 1316 /* For each call to fetch two are made to \core. It would be 1317 * preferable to have twice the work done in \core. 1318 */ 1319 \core 1320 st1 {v15.8b}, [x0], #8 1321 \core 1322 st1 {v15.8b}, [x0], #8 1323 1324 sub x3, x3, #16 13255: subs x4, x4, #16 1326 bhs 3b 1327 adds x4, x4, #16 1328 bne 1f 1329 .if \step==1 1330 dup v10.8h, v9.h[7] 1331 dup v11.8h, v9.h[7] 1332 .else 1333 dup v10.2d, v9.d[1] 1334 dup v11.2d, v9.d[1] 1335 .endif 1336 b 4f 1337 13381: sub x1, x1, #16 1339 sub x15, x15, #16 1340 sub x19, x19, #16 1341 add x1, x1, x4 1342 add x15, x15, x4 1343 add x19, x19, x4 1344 bl fetch_generic_asm 1345 1346 .if \step==1 1347 dup v12.8h, v11.h[7] 1348 .else 1349 dup v12.2d, v11.d[1] 1350 .endif 1351 sub x4, xzr, x4 1352 tbz x4, #3, 1f 1353 mov v10.16b, v11.16b 1354 mov v11.16b, v12.16b 13551: tbz x4, #2, 1f 1356 ext v10.16b, v10.16b, v11.16b, #4*2 1357 ext v11.16b, v11.16b, v12.16b, #4*2 13581: tbz x4, #1, 1f 1359 ext v10.16b, v10.16b, v11.16b, #2*2 1360 ext v11.16b, v11.16b, v12.16b, #2*2 13611: tbz x4, #0, 4f 1362 ext v10.16b, v10.16b, v11.16b, #1*2 1363 ext v11.16b, v11.16b, v12.16b, #1*2 13644: cbz x3, 5f 13653: \core 1366 .if \step==1 1367 dup v11.8h, v11.h[7] 1368 .else 1369 dup v11.2d, v11.d[1] 1370 .endif 1371 subs x3, x3, #8 1372 blo 4f 1373 st1 {v15.8b}, [x0], #8 1374 beq 5f 1375 b 3b 13764: tbz x3, #2, 1f 1377 st1 {v15.s}[0], [x0], #4 1378 ext v15.16b, v15.16b, v15.16b, #4*2 13791: tbz x3, #1, 1f 1380 st1 {v15.h}[0], [x0], #2 1381 ext v15.16b, v15.16b, v15.16b, #2*2 13821: tbz x3, #0, 5f 1383 st1 {v15.b}[0], [x0], #1 1384 ext v15.16b, v15.16b, v15.16b, #1*2 13855: nop 1386.endm 1387 1388.irep r, TUNED_LIST1, 25 1389PRIVATE(convolve1_\r) 1390 stp x29,x30, [sp, #-16]! 
1391 1392 prefetch step=1, max_r=\r 1393 1394 mainloop core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r 1395 1396 ldp x29,x30, [sp], #16 1397 ret 1398END(convolve1_\r) 1399.endr 1400 1401.irep r, TUNED_LIST4, 25 1402PRIVATE(convolve4_\r) 1403 sub x12, sp, #0x200 1404 bic x9, x12, #0x3fc 1405 mov sp, x9 1406 stp x12,x30, [sp, #-16]! 1407 1408 /* x9 now points to a buffer on the stack whose address has the low 1409 * 10 bits clear. This allows easy address calculation in the 1410 * wrap-around cases. 1411 */ 1412 1413 1414 prefetch step=4, max_r=\r 1415 1416 mainloop core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r 1417 1418 ldp x12,x30, [sp] 1419 add sp, x12, #0x200 1420 ret 1421END(convolve4_\r) 1422.endr 1423 1424/* void rsdIntrinsicBlurU1_K( 1425 * void *out, // x0 1426 * void *in, // x1 1427 * size_t w, // x2 1428 * size_t h, // x3 1429 * size_t p, // x4 1430 * size_t x, // x5 1431 * size_t y, // x6 1432 * size_t count, // x7 1433 * size_t r, // [sp] 1434 * uint16_t *tab); // [sp,#8] 1435 */ 1436ENTRY(rsdIntrinsicBlurU1_K) 1437 stp x19,x30, [sp, #-16]! 
1438 sub x8, sp, #32 1439 sub sp, sp, #64 1440 st1 {v8.1d - v11.1d}, [sp] 1441 st1 {v12.1d - v15.1d}, [x8] 1442 mov x8, x5 // x 1443 ldr w5, [sp,#80] // r 1444 sub x9, x2, x8 1445 sub x10, x3, x6 1446 mov x2, x4 // pitch 1447 mov x3, x7 // count 1448 sub x7, x10, #1 1449 sub x9, x9, x3 1450 1451 ldr x12, [sp, #88] // tab 1452 1453 add x0, x0, x8 1454 add x1, x1, x8 1455 1456 cmp x6, x5 1457 csel x6, x5, x6, hs 1458 cmp x7, x5 1459 csel x7, x5, x7, hs 1460 cmp x8, x5 1461 csel x8, x5, x8, hs 1462 cmp x9, x5 1463 csel x9, x5, x8, hs 1464 1465 add x4, x8, x9 1466 add x4, x4, x3 1467 1468 sub x1, x1, x8 1469 1470 sub x13, xzr, x2 1471 msub x15, x2, x6, x1 1472 madd x19, x2, x7, x1 1473 1474 ld1 {v0.8h,v1.8h}, [x12], #32 1475 ld1 {v2.8h,v3.8h}, [x12], #32 1476 1477 adr x30, 1f 1478 .irep r, TUNED_LIST1 1479 cmp x5, #\r 1480 bls convolve1_\r 1481 .endr 1482 b convolve1_25 1483 14841: ld1 {v8.1d - v11.1d}, [sp], #32 1485 ld1 {v12.1d - v15.1d}, [sp], #32 1486 ldp x19,x30, [sp], #16 1487 ret 1488END(rsdIntrinsicBlurU1_K) 1489 1490/* void rsdIntrinsicBlurU4_K( 1491 * void *out, // x0 1492 * void *in, // x1 1493 * size_t w, // x2 1494 * size_t h, // x3 1495 * size_t p, // x4 1496 * size_t x, // x5 1497 * size_t y, // x6 1498 * size_t count, // x7 1499 * size_t r, // [sp] 1500 * uint16_t *tab); // [sp,#8] 1501 */ 1502ENTRY(rsdIntrinsicBlurU4_K) 1503 stp x19,x30, [sp, #-16]! 
1504 sub x8, sp, #32 1505 sub sp, sp, #64 1506 st1 {v8.1d - v11.1d}, [sp] 1507 st1 {v12.1d - v15.1d}, [x8] 1508 mov x8, x5 // x 1509 ldr w5, [sp,#80] // r 1510 sub x9, x2, x8 1511 sub x10, x3, x6 1512 mov x2, x4 // pitch 1513 mov x3, x7 // count 1514 sub x7, x10, #1 1515 sub x9, x9, x3 1516 1517 ldr x12, [sp, #88] 1518 1519 add x0, x0, x8, LSL #2 1520 add x1, x1, x8, LSL #2 1521 1522 cmp x6, x5 1523 csel x6, x5, x6, hs 1524 cmp x7, x5 1525 csel x7, x5, x7, hs 1526 cmp x8, x5 1527 csel x8, x5, x8, hs 1528 cmp x9, x5 1529 csel x9, x5, x9, hs 1530 1531 lsl x3, x3, #2 1532 add x4, x8, x9 1533 add x4, x3, x4, LSL #2 1534 1535 sub x1, x1, x8, LSL #2 1536 1537 sub x13, xzr, x2 1538 msub x15, x2, x6, x1 1539 madd x19, x2, x7, x1 1540 1541 ld1 {v0.8h,v1.8h}, [x12], #32 1542 ld1 {v2.8h,v3.8h}, [x12], #32 1543 1544 adr x30, 1f 1545 .irep r, TUNED_LIST4 1546 cmp x5, #\r 1547 bls convolve4_\r 1548 .endr 1549 b convolve4_25 1550 15511: ld1 {v8.1d - v11.1d}, [sp], #32 1552 ld1 {v12.1d - v15.1d}, [sp], #32 1553 ldp x19,x30, [sp], #16 1554 ret 1555END(rsdIntrinsicBlurU4_K) 1556