/* rsCpuIntrinsics_advsimd_Blur.S — revision 446788007efe0a673d0366284026adfa17b36fed */
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* AArch64 AdvSIMD implementation of the Gaussian blur intrinsic.
 * ENTRY/END bracket each exported function with the standard
 * .globl/.type/.size bookkeeping; .align 4 keeps entry points 16-byte
 * aligned.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Number of fractional bits in the fixed-point coefficient table held in
 * q0-q3; see the uqrshrn #16-FRACTION_BITS / #FRACTION_BITS narrowing
 * sequences at the end of each convolution.
 */
.set FRACTION_BITS, 7
/* Maximum supported convolution radius. */
.set MAX_R, 25


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm

/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top or
 * bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch directly
 * into the relevant part of the code for an arbitrary convolution radius.  Two
 * variants of the loop are produced; one eliminates the clamping code for a
 * slight speed advantage.
 *
 * Where the macro is called with reg=x, the specified register is taken to
 * contain a pre-calculated pointer into one of the two loops.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x12 -- switch index
 *      q0-q3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x16 = bottom-row in
 * Output:
 *      x1 += 16
 *      q10,q11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      q12-q15 = temporary sums
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  /* cc=1 selects the clamped variant; it is also used to enable the address
   * computation for the computed branch when the caller didn't supply one.
   */
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            /* Centre tap: load 16 bytes, widen to 16-bit halves. */
            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            uxtl        v14.8h, v15.8b
//          prfm        PLDL1KEEP,[x1, #16] // TODO: confirm
            uxtl2       v15.8h, v15.16b
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

            /* Seed the four 32-bit accumulators with the centre tap, while
             * (ifcc) computing the entry point: each tap below assembles to
             * a fixed-size group, so the entry offset is a linear function
             * of r (x5).
             */
            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x16
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* Generate the tap chain twice: first pass (rowclamp=1) with edge
   * clamping, second pass (rowclamp=0) without.  The \doth iterator expands
   * to `.h`, building the v<dreg>.h[lane] coefficient operand.
   */
  .irp rowclamp, 1, 0
    .set cc, \rowclamp
    .align 4
    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
      .set i, \dreg * 8 + \lane
      .if 0 < i && i <= \max_r
            /* Tap at distance i: add the rows i above and i below, then
             * accumulate with coefficient table entry i.  When clamping,
             * csel pins the row pointers at the image edges (rup/rdn).
             */
            ld1         {v10.16b}, [x10], x2
    ifcc    cmp         x6, #i
            ld1         {v11.16b}, [x11], x13
    ifcc    csel        x10, x15, x10, lo
            uaddl       v16.8h, v10.8b, v11.8b
    ifcc    cmp         x7, #i
            uaddl2      v11.8h, v10.16b, v11.16b
    ifcc    csel        x11, x16, x11, lo
            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
//          prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
            nop         // placeholder keeping the per-tap group size fixed
            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
//          prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
            nop         // placeholder keeping the per-tap group size fixed
            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
      .endif
    .endr ; .endr ; .endr
    .if \rowclamp == 1
      1: \labelc :
            b           2f
    .else
      2: \labelnc :
    .endif
  .endr

            /* Narrow the sums back to 16-bit columns in q10/q11 and advance
             * the clamped row-base pointers past this group of 16 bytes.
             */
            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x16, x16, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the buffer gets too big the buffer at [r9] is used.
 * NOTE(review): r9/r12 in this comment are AArch32 leftovers; in this port
 * they correspond to x9/x12 -- verify against the 32-bit original.
 *
 * Input:
 *      q4-q11 -- convolution window
 *      r9 -- pointer to additional convolution window data
 * Output:
 *      r9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 * Modifies:
 *      r12 -- temp buffer pointer
 *      q12-q13 -- temporaries for load and vext operations.
156 * q14-q15 -- intermediate sums 157 */ 158#define TUNED_LIST1 8, 16 159.macro hconv1_8/*{{{*/ 160 umull v14.4s, v9.4h, v0.h[0] 161 umull2 v15.4s, v9.8h, v0.h[0] 162 163 adr x12, 199f-8 164 ldr x12, [x12, x5, LSL #3] 165 br x12 166 199: .xword 101f 167 .xword 102f 168 .xword 103f 169 .xword 104f 170 .xword 105f 171 .xword 106f 172 .xword 107f 173 .xword 108f 174 .align 4 175 108: umlal v14.4s, v8.4h, v1.h[0] 176 umlal2 v15.4s, v8.8h, v1.h[0] 177 umlal v14.4s, v10.4h, v1.h[0] 178 umlal2 v15.4s, v10.8h, v1.h[0] 179 107: ext v12.16b, v8.16b, v9.16b, #1*2 180 ext v13.16b, v9.16b, v10.16b, #7*2 181 umlal v14.4s, v12.4h, v0.h[7] 182 umlal2 v15.4s, v12.8h, v0.h[7] 183 umlal v14.4s, v13.4h, v0.h[7] 184 umlal2 v15.4s, v13.8h, v0.h[7] 185 106: ext v12.16b, v8.16b, v9.16b, #2*2 186 ext v13.16b, v9.16b, v10.16b, #6*2 187 umlal v14.4s, v12.4h, v0.h[6] 188 umlal2 v15.4s, v12.8h, v0.h[6] 189 umlal v14.4s, v13.4h, v0.h[6] 190 umlal2 v15.4s, v13.8h, v0.h[6] 191 105: ext v12.16b, v8.16b, v9.16b, #3*2 192 ext v13.16b, v9.16b, v10.16b, #5*2 193 umlal v14.4s, v12.4h, v0.h[5] 194 umlal2 v15.4s, v12.8h, v0.h[5] 195 umlal v14.4s, v13.4h, v0.h[5] 196 umlal2 v15.4s, v13.8h, v0.h[5] 197 104: //ext v12.16b, v8.16b, v9.16b, #4*2 198 //ext v13.16b, v9.16b, v10.16b, #4*2 199 umlal2 v14.4s, v8.8h, v0.h[4] 200 umlal v15.4s, v9.4h, v0.h[4] 201 umlal2 v14.4s, v9.8h, v0.h[4] 202 umlal v15.4s, v10.4h, v0.h[4] 203 103: ext v12.16b, v8.16b, v9.16b, #5*2 204 ext v13.16b, v9.16b, v10.16b, #3*2 205 umlal v14.4s, v12.4h, v0.h[3] 206 umlal2 v15.4s, v12.8h, v0.h[3] 207 umlal v14.4s, v13.4h, v0.h[3] 208 umlal2 v15.4s, v13.8h, v0.h[3] 209 102: ext v12.16b, v8.16b, v9.16b, #6*2 210 ext v13.16b, v9.16b, v10.16b, #2*2 211 umlal v14.4s, v12.4h, v0.h[2] 212 umlal2 v15.4s, v12.8h, v0.h[2] 213 umlal v14.4s, v13.4h, v0.h[2] 214 umlal2 v15.4s, v13.8h, v0.h[2] 215 101: ext v12.16b, v8.16b, v9.16b, #7*2 216 ext v13.16b, v9.16b, v10.16b, #1*2 217 umlal v14.4s, v12.4h, v0.h[1] 218 umlal2 v15.4s, v12.8h, v0.h[1] 219 umlal 
v14.4s, v13.4h, v0.h[1] 220 umlal2 v15.4s, v13.8h, v0.h[1] 221 222 uqrshrn v14.4h, v14.4s, #16 223 uqrshrn2 v14.8h, v15.4s, #16 224 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 225 226 mov v8.16b, v9.16b 227 mov v9.16b, v10.16b 228 mov v10.16b, v11.16b 229.endm/*}}}*/ 230 231.macro hconv1_16/*{{{*/ 232 umull v14.4s, v8.4h, v0.h[0] 233 umull2 v15.4s, v8.8h, v0.h[0] 234 235 adr x12, 199f-8 236 ldr x12, [x12, x5, LSL #3] 237 br x12 238 199: .xword 101f 239 .xword 102f 240 .xword 103f 241 .xword 104f 242 .xword 105f 243 .xword 106f 244 .xword 107f 245 .xword 108f 246 .xword 109f 247 .xword 110f 248 .xword 111f 249 .xword 112f 250 .xword 113f 251 .xword 114f 252 .xword 115f 253 .xword 116f 254 .align 4 255 116: //ext v12.16b, v6.16b, v7.16b, #0*2 256 //ext v13.16b, v10.16b, v11.16b, #0*2 257 umlal v14.4s, v6.4h, v2.h[0] 258 umlal2 v15.4s, v6.8h, v2.h[0] 259 umlal v14.4s, v10.4h, v2.h[0] 260 umlal2 v15.4s, v10.8h, v2.h[0] 261 115: ext v12.16b, v6.16b, v7.16b, #1*2 262 ext v13.16b, v9.16b, v10.16b, #7*2 263 umlal v14.4s, v12.4h, v1.h[7] 264 umlal2 v15.4s, v12.8h, v1.h[7] 265 umlal v14.4s, v13.4h, v1.h[7] 266 umlal2 v15.4s, v13.8h, v1.h[7] 267 114: ext v12.16b, v6.16b, v7.16b, #2*2 268 ext v13.16b, v9.16b, v10.16b, #6*2 269 umlal v14.4s, v12.4h, v1.h[6] 270 umlal2 v15.4s, v12.8h, v1.h[6] 271 umlal v14.4s, v13.4h, v1.h[6] 272 umlal2 v15.4s, v13.8h, v1.h[6] 273 113: ext v12.16b, v6.16b, v7.16b, #3*2 274 ext v13.16b, v9.16b, v10.16b, #5*2 275 umlal v14.4s, v12.4h, v1.h[5] 276 umlal2 v15.4s, v12.8h, v1.h[5] 277 umlal v14.4s, v13.4h, v1.h[5] 278 umlal2 v15.4s, v13.8h, v1.h[5] 279 112: //ext v12.16b, v6.16b, v7.16b, #4*2 280 //ext v13.16b, v9.16b, v10.16b, #4*2 281 umlal2 v14.4s, v6.8h, v1.h[4] 282 umlal v15.4s, v7.4h, v1.h[4] 283 umlal2 v14.4s, v9.8h, v1.h[4] 284 umlal v15.4s, v10.4h, v1.h[4] 285 111: ext v12.16b, v6.16b, v7.16b, #5*2 286 ext v13.16b, v9.16b, v10.16b, #3*2 287 umlal v14.4s, v12.4h, v1.h[3] 288 umlal2 v15.4s, v12.8h, v1.h[3] 289 umlal v14.4s, v13.4h, v1.h[3] 290 
umlal2 v15.4s, v13.8h, v1.h[3] 291 110: ext v12.16b, v6.16b, v7.16b, #6*2 292 ext v13.16b, v9.16b, v10.16b, #2*2 293 umlal v14.4s, v12.4h, v1.h[2] 294 umlal2 v15.4s, v12.8h, v1.h[2] 295 umlal v14.4s, v13.4h, v1.h[2] 296 umlal2 v15.4s, v13.8h, v1.h[2] 297 109: ext v12.16b, v6.16b, v7.16b, #7*2 298 ext v13.16b, v9.16b, v10.16b, #1*2 299 umlal v14.4s, v12.4h, v1.h[1] 300 umlal2 v15.4s, v12.8h, v1.h[1] 301 umlal v14.4s, v13.4h, v1.h[1] 302 umlal2 v15.4s, v13.8h, v1.h[1] 303 108: //ext v12.16b, v7.16b, v8.16b, #0*2 304 //ext v13.16b, v9.16b, v10.16b, #0*2 305 umlal v14.4s, v7.4h, v1.h[0] 306 umlal2 v15.4s, v7.8h, v1.h[0] 307 umlal v14.4s, v9.4h, v1.h[0] 308 umlal2 v15.4s, v9.8h, v1.h[0] 309 107: ext v12.16b, v7.16b, v8.16b, #1*2 310 ext v13.16b, v8.16b, v9.16b, #7*2 311 umlal v14.4s, v12.4h, v0.h[7] 312 umlal2 v15.4s, v12.8h, v0.h[7] 313 umlal v14.4s, v13.4h, v0.h[7] 314 umlal2 v15.4s, v13.8h, v0.h[7] 315 106: ext v12.16b, v7.16b, v8.16b, #2*2 316 ext v13.16b, v8.16b, v9.16b, #6*2 317 umlal v14.4s, v12.4h, v0.h[6] 318 umlal2 v15.4s, v12.8h, v0.h[6] 319 umlal v14.4s, v13.4h, v0.h[6] 320 umlal2 v15.4s, v13.8h, v0.h[6] 321 105: ext v12.16b, v7.16b, v8.16b, #3*2 322 ext v13.16b, v8.16b, v9.16b, #5*2 323 umlal v14.4s, v12.4h, v0.h[5] 324 umlal2 v15.4s, v12.8h, v0.h[5] 325 umlal v14.4s, v13.4h, v0.h[5] 326 umlal2 v15.4s, v13.8h, v0.h[5] 327 104: //ext v12.16b, v7.16b, v8.16b, #4*2 328 //ext v13.16b, v8.16b, v9.16b, #4*2 329 umlal2 v14.4s, v7.8h, v0.h[4] 330 umlal v15.4s, v8.4h, v0.h[4] 331 umlal2 v14.4s, v8.8h, v0.h[4] 332 umlal v15.4s, v9.4h, v0.h[4] 333 103: ext v12.16b, v7.16b, v8.16b, #5*2 334 ext v13.16b, v8.16b, v9.16b, #3*2 335 umlal v14.4s, v12.4h, v0.h[3] 336 umlal2 v15.4s, v12.8h, v0.h[3] 337 umlal v14.4s, v13.4h, v0.h[3] 338 umlal2 v15.4s, v13.8h, v0.h[3] 339 102: ext v12.16b, v7.16b, v8.16b, #6*2 340 ext v13.16b, v8.16b, v9.16b, #2*2 341 umlal v14.4s, v12.4h, v0.h[2] 342 umlal2 v15.4s, v12.8h, v0.h[2] 343 umlal v14.4s, v13.4h, v0.h[2] 344 umlal2 v15.4s, v13.8h, 
v0.h[2] 345 101: ext v12.16b, v7.16b, v8.16b, #7*2 346 ext v13.16b, v8.16b, v9.16b, #1*2 347 umlal v14.4s, v12.4h, v0.h[1] 348 umlal2 v15.4s, v12.8h, v0.h[1] 349 umlal v14.4s, v13.4h, v0.h[1] 350 umlal2 v15.4s, v13.8h, v0.h[1] 351 352 uqrshrn v14.4h, v14.4s, #16 353 uqrshrn2 v14.8h, v15.4s, #16 354 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 355 356 mov v6.16b, v7.16b 357 mov v7.16b, v8.16b 358 mov v8.16b, v9.16b 359 mov v9.16b, v10.16b 360 mov v10.16b, v11.16b 361.endm/*}}}*/ 362 363.macro hconv1_25/*{{{*/ 364 ext v12.16b, v6.16b, v7.16b, #7*2 365 umull v14.4s, v12.4h, v0.h[0] 366 umull2 v15.4s, v12.8h, v0.h[0] 367 368 adr x12, 199f-8 369 ldr x12, [x12, x5, LSL #3] 370 br x12 371 199: .xword 101f 372 .xword 102f 373 .xword 103f 374 .xword 104f 375 .xword 105f 376 .xword 106f 377 .xword 107f 378 .xword 108f 379 .xword 109f 380 .xword 110f 381 .xword 111f 382 .xword 112f 383 .xword 113f 384 .xword 114f 385 .xword 115f 386 .xword 116f 387 .xword 117f 388 .xword 118f 389 .xword 119f 390 .xword 120f 391 .xword 121f 392 .xword 122f 393 .xword 123f 394 .xword 124f 395 .xword 125f 396 .align 4 397 125: ext v12.16b, v3.16b, v4.16b, #6*2 398 ext v13.16b, v10.16b, v11.16b, #0*2 399 umlal v14.4s, v12.4h, v3.h[1] 400 umlal2 v15.4s, v12.8h, v3.h[1] 401 umlal v14.4s, v13.4h, v3.h[1] 402 umlal2 v15.4s, v13.8h, v3.h[1] 403 124: ext v12.16b, v3.16b, v4.16b, #7*2 404 ext v13.16b, v9.16b, v10.16b, #7*2 405 umlal v14.4s, v12.4h, v3.h[0] 406 umlal2 v15.4s, v12.8h, v3.h[0] 407 umlal v14.4s, v13.4h, v3.h[0] 408 umlal2 v15.4s, v13.8h, v3.h[0] 409 123: ext v12.16b, v4.16b, v5.16b, #0*2 410 ext v13.16b, v9.16b, v10.16b, #6*2 411 umlal v14.4s, v12.4h, v2.h[7] 412 umlal2 v15.4s, v12.8h, v2.h[7] 413 umlal v14.4s, v13.4h, v2.h[7] 414 umlal2 v15.4s, v13.8h, v2.h[7] 415 122: ext v12.16b, v4.16b, v5.16b, #1*2 416 ext v13.16b, v9.16b, v10.16b, #5*2 417 umlal v14.4s, v12.4h, v2.h[6] 418 umlal2 v15.4s, v12.8h, v2.h[6] 419 umlal v14.4s, v13.4h, v2.h[6] 420 umlal2 v15.4s, v13.8h, v2.h[6] 421 121: ext 
v12.16b, v4.16b, v5.16b, #2*2 422 ext v13.16b, v9.16b, v10.16b, #4*2 423 umlal v14.4s, v12.4h, v2.h[5] 424 umlal2 v15.4s, v12.8h, v2.h[5] 425 umlal v14.4s, v13.4h, v2.h[5] 426 umlal2 v15.4s, v13.8h, v2.h[5] 427 120: ext v12.16b, v4.16b, v5.16b, #3*2 428 ext v13.16b, v9.16b, v10.16b, #3*2 429 umlal v14.4s, v12.4h, v2.h[4] 430 umlal2 v15.4s, v12.8h, v2.h[4] 431 umlal v14.4s, v13.4h, v2.h[4] 432 umlal2 v15.4s, v13.8h, v2.h[4] 433 119: ext v12.16b, v4.16b, v5.16b, #4*2 434 ext v13.16b, v9.16b, v10.16b, #2*2 435 umlal v14.4s, v12.4h, v2.h[3] 436 umlal2 v15.4s, v12.8h, v2.h[3] 437 umlal v14.4s, v13.4h, v2.h[3] 438 umlal2 v15.4s, v13.8h, v2.h[3] 439 118: ext v12.16b, v4.16b, v5.16b, #5*2 440 ext v13.16b, v9.16b, v10.16b, #1*2 441 umlal v14.4s, v12.4h, v2.h[2] 442 umlal2 v15.4s, v12.8h, v2.h[2] 443 umlal v14.4s, v13.4h, v2.h[2] 444 umlal2 v15.4s, v13.8h, v2.h[2] 445 117: ext v12.16b, v4.16b, v5.16b, #6*2 446 ext v13.16b, v9.16b, v10.16b, #0*2 447 umlal v14.4s, v12.4h, v2.h[1] 448 umlal2 v15.4s, v12.8h, v2.h[1] 449 umlal v14.4s, v13.4h, v2.h[1] 450 umlal2 v15.4s, v13.8h, v2.h[1] 451 116: ext v12.16b, v4.16b, v5.16b, #7*2 452 ext v13.16b, v8.16b, v9.16b, #7*2 453 umlal v14.4s, v12.4h, v2.h[0] 454 umlal2 v15.4s, v12.8h, v2.h[0] 455 umlal v14.4s, v13.4h, v2.h[0] 456 umlal2 v15.4s, v13.8h, v2.h[0] 457 115: ext v12.16b, v5.16b, v6.16b, #0*2 458 ext v13.16b, v8.16b, v9.16b, #6*2 459 umlal v14.4s, v12.4h, v1.h[7] 460 umlal2 v15.4s, v12.8h, v1.h[7] 461 umlal v14.4s, v13.4h, v1.h[7] 462 umlal2 v15.4s, v13.8h, v1.h[7] 463 114: ext v12.16b, v5.16b, v6.16b, #1*2 464 ext v13.16b, v8.16b, v9.16b, #5*2 465 umlal v14.4s, v12.4h, v1.h[6] 466 umlal2 v15.4s, v12.8h, v1.h[6] 467 umlal v14.4s, v13.4h, v1.h[6] 468 umlal2 v15.4s, v13.8h, v1.h[6] 469 113: ext v12.16b, v5.16b, v6.16b, #2*2 470 ext v13.16b, v8.16b, v9.16b, #4*2 471 umlal v14.4s, v12.4h, v1.h[5] 472 umlal2 v15.4s, v12.8h, v1.h[5] 473 umlal v14.4s, v13.4h, v1.h[5] 474 umlal2 v15.4s, v13.8h, v1.h[5] 475 112: ext v12.16b, v5.16b, 
v6.16b, #3*2 476 ext v13.16b, v8.16b, v9.16b, #3*2 477 umlal v14.4s, v12.4h, v1.h[4] 478 umlal2 v15.4s, v12.8h, v1.h[4] 479 umlal v14.4s, v13.4h, v1.h[4] 480 umlal2 v15.4s, v13.8h, v1.h[4] 481 111: ext v12.16b, v5.16b, v6.16b, #4*2 482 ext v13.16b, v8.16b, v9.16b, #2*2 483 umlal v14.4s, v12.4h, v1.h[3] 484 umlal2 v15.4s, v12.8h, v1.h[3] 485 umlal v14.4s, v13.4h, v1.h[3] 486 umlal2 v15.4s, v13.8h, v1.h[3] 487 110: ext v12.16b, v5.16b, v6.16b, #5*2 488 ext v13.16b, v8.16b, v9.16b, #1*2 489 umlal v14.4s, v12.4h, v1.h[2] 490 umlal2 v15.4s, v12.8h, v1.h[2] 491 umlal v14.4s, v13.4h, v1.h[2] 492 umlal2 v15.4s, v13.8h, v1.h[2] 493 109: ext v12.16b, v5.16b, v6.16b, #6*2 494 ext v13.16b, v8.16b, v9.16b, #0*2 495 umlal v14.4s, v12.4h, v1.h[1] 496 umlal2 v15.4s, v12.8h, v1.h[1] 497 umlal v14.4s, v13.4h, v1.h[1] 498 umlal2 v15.4s, v13.8h, v1.h[1] 499 108: ext v12.16b, v5.16b, v6.16b, #7*2 500 ext v13.16b, v7.16b, v8.16b, #7*2 501 umlal v14.4s, v12.4h, v1.h[0] 502 umlal2 v15.4s, v12.8h, v1.h[0] 503 umlal v14.4s, v13.4h, v1.h[0] 504 umlal2 v15.4s, v13.8h, v1.h[0] 505 107: ext v12.16b, v6.16b, v7.16b, #0*2 506 ext v13.16b, v7.16b, v8.16b, #6*2 507 umlal v14.4s, v12.4h, v0.h[7] 508 umlal2 v15.4s, v12.8h, v0.h[7] 509 umlal v14.4s, v13.4h, v0.h[7] 510 umlal2 v15.4s, v13.8h, v0.h[7] 511 106: ext v12.16b, v6.16b, v7.16b, #1*2 512 ext v13.16b, v7.16b, v8.16b, #5*2 513 umlal v14.4s, v12.4h, v0.h[6] 514 umlal2 v15.4s, v12.8h, v0.h[6] 515 umlal v14.4s, v13.4h, v0.h[6] 516 umlal2 v15.4s, v13.8h, v0.h[6] 517 105: ext v12.16b, v6.16b, v7.16b, #2*2 518 ext v13.16b, v7.16b, v8.16b, #4*2 519 umlal v14.4s, v12.4h, v0.h[5] 520 umlal2 v15.4s, v12.8h, v0.h[5] 521 umlal v14.4s, v13.4h, v0.h[5] 522 umlal2 v15.4s, v13.8h, v0.h[5] 523 104: ext v12.16b, v6.16b, v7.16b, #3*2 524 ext v13.16b, v7.16b, v8.16b, #3*2 525 umlal v14.4s, v12.4h, v0.h[4] 526 umlal2 v15.4s, v12.8h, v0.h[4] 527 umlal v14.4s, v13.4h, v0.h[4] 528 umlal2 v15.4s, v13.8h, v0.h[4] 529 103: ext v12.16b, v6.16b, v7.16b, #4*2 530 ext 
v13.16b, v7.16b, v8.16b, #2*2 531 umlal v14.4s, v12.4h, v0.h[3] 532 umlal2 v15.4s, v12.8h, v0.h[3] 533 umlal v14.4s, v13.4h, v0.h[3] 534 umlal2 v15.4s, v13.8h, v0.h[3] 535 102: ext v12.16b, v6.16b, v7.16b, #5*2 536 ext v13.16b, v7.16b, v8.16b, #1*2 537 umlal v14.4s, v12.4h, v0.h[2] 538 umlal2 v15.4s, v12.8h, v0.h[2] 539 umlal v14.4s, v13.4h, v0.h[2] 540 umlal2 v15.4s, v13.8h, v0.h[2] 541 101: ext v12.16b, v6.16b, v7.16b, #6*2 542 ext v13.16b, v7.16b, v8.16b, #0*2 543 umlal v14.4s, v12.4h, v0.h[1] 544 umlal2 v15.4s, v12.8h, v0.h[1] 545 umlal v14.4s, v13.4h, v0.h[1] 546 umlal2 v15.4s, v13.8h, v0.h[1] 547 548 uqrshrn v14.4h, v14.4s, #16 549 uqrshrn2 v14.8h, v15.4s, #16 550 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 551 552 ins v3.d[1], v4.d[0] 553 mov v4.16b, v5.16b 554 mov v5.16b, v6.16b 555 mov v6.16b, v7.16b 556 mov v7.16b, v8.16b 557 mov v8.16b, v9.16b 558 mov v9.16b, v10.16b 559 mov v10.16b, v11.16b 560.endm/*}}}*/ 561 562#define TUNED_LIST4 6, 12 563.macro hconv4_6/*{{{*/ 564 umull v14.4s, v7.4h, v0.h[0] 565 umull2 v15.4s, v7.8h, v0.h[0] 566 567 adr x12, 199f-8 568 ldr x12, [x12, x5, LSL #3] 569 br x12 570 199: .xword 101f 571 .xword 102f 572 .xword 103f 573 .xword 104f 574 .xword 105f 575 .xword 106f 576 .align 4 577 106: umlal v14.4s, v4.4h, v0.h[6] 578 umlal2 v15.4s, v4.8h, v0.h[6] 579 umlal v14.4s, v10.4h, v0.h[6] 580 umlal2 v15.4s, v10.8h, v0.h[6] 581 105: umlal2 v14.4s, v4.8h, v0.h[5] 582 umlal v15.4s, v5.4h, v0.h[5] 583 umlal2 v14.4s, v9.8h, v0.h[5] 584 umlal v15.4s, v10.4h, v0.h[5] 585 104: umlal v14.4s, v5.4h, v0.h[4] 586 umlal2 v15.4s, v5.8h, v0.h[4] 587 umlal v14.4s, v9.4h, v0.h[4] 588 umlal2 v15.4s, v9.8h, v0.h[4] 589 103: umlal2 v14.4s, v5.8h, v0.h[3] 590 umlal v15.4s, v6.4h, v0.h[3] 591 umlal2 v14.4s, v8.8h, v0.h[3] 592 umlal v15.4s, v9.4h, v0.h[3] 593 102: umlal v14.4s, v6.4h, v0.h[2] 594 umlal2 v15.4s, v6.8h, v0.h[2] 595 umlal v14.4s, v8.4h, v0.h[2] 596 umlal2 v15.4s, v8.8h, v0.h[2] 597 101: umlal2 v14.4s, v6.8h, v0.h[1] 598 umlal v15.4s, v7.4h, 
v0.h[1] 599 umlal2 v14.4s, v7.8h, v0.h[1] 600 umlal v15.4s, v8.4h, v0.h[1] 601 602 uqrshrn v14.4h, v14.4s, #16 603 uqrshrn2 v14.8h, v15.4s, #16 604 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 605 606 mov v4.16b, v5.16b 607 mov v5.16b, v6.16b 608 mov v6.16b, v7.16b 609 mov v7.16b, v8.16b 610 mov v8.16b, v9.16b 611 mov v9.16b, v10.16b 612 mov v10.16b, v11.16b 613.endm/*}}}*/ 614 615.macro hconv4_12/*{{{*/ 616 umull v14.4s, v4.4h, v0.h[0] 617 umull2 v15.4s, v4.8h, v0.h[0] 618 619 adr x12, 199f-8 620 ldr x12, [x12, x5, LSL #3] 621 br x12 622 199: .xword 101f 623 .xword 102f 624 .xword 103f 625 .xword 104f 626 .xword 105f 627 .xword 106f 628 .xword 107f 629 .xword 108f 630 .xword 109f 631 .xword 110f 632 .xword 111f 633 .xword 112f 634 .align 4 635 112: add x12, x9, #0x1a0 636 bic x12, x12, #0x200 637 ld1 {v12.8h}, [x12] 638 umlal v14.4s, v12.4h, v1.h[4] 639 umlal2 v15.4s, v12.8h, v1.h[4] 640 umlal v14.4s, v10.4h, v1.h[4] 641 umlal2 v15.4s, v10.8h, v1.h[4] 642 111: add x12, x9, #0x1a8 643 bic x12, x12, #0x200 644 ld1 {v12.4h}, [x12], #8 645 bic x12, x12, #0x200 646 ld1 {v13.4h}, [x12] 647 umlal v14.4s, v12.4h, v1.h[3] 648 umlal v15.4s, v13.4h, v1.h[3] 649 umlal2 v14.4s, v9.8h, v1.h[3] 650 umlal v15.4s, v10.4h, v1.h[3] 651 110: add x12, x9, #0x1b0 652 bic x12, x12, #0x200 653 ld1 {v12.8h}, [x12] 654 umlal v14.4s, v12.4h, v1.h[2] 655 umlal2 v15.4s, v12.8h, v1.h[2] 656 umlal v14.4s, v9.4h, v1.h[2] 657 umlal2 v15.4s, v9.8h, v1.h[2] 658 109: add x12, x9, #0x1b8 659 bic x12, x12, #0x200 660 ld1 {v12.4h}, [x12], #8 661 bic x12, x12, #0x200 662 ld1 {v13.4h}, [x12] 663 umlal v14.4s, v12.4h, v1.h[1] 664 umlal v15.4s, v13.4h, v1.h[1] 665 umlal2 v14.4s, v8.8h, v1.h[1] 666 umlal v15.4s, v9.4h, v1.h[1] 667 108: add x12, x9, #0x1c0 668 bic x12, x12, #0x200 669 ld1 {v12.8h}, [x12] 670 umlal v14.4s, v12.4h, v1.h[0] 671 umlal2 v15.4s, v12.8h, v1.h[0] 672 umlal v14.4s, v8.4h, v1.h[0] 673 umlal2 v15.4s, v8.8h, v1.h[0] 674 107: add x12, x9, #0x1c8 675 bic x12, x12, #0x200 676 ld1 {v12.4h}, 
[x12], #8 677 bic x12, x12, #0x200 678 ld1 {v13.4h}, [x12] 679 umlal v14.4s, v12.4h, v0.h[7] 680 umlal v15.4s, v13.4h, v0.h[7] 681 umlal2 v14.4s, v7.8h, v0.h[7] 682 umlal v15.4s, v8.4h, v0.h[7] 683 106: add x12, x9, #0x1d0 684 bic x12, x12, #0x200 685 ld1 {v12.8h}, [x12] 686 umlal v14.4s, v12.4h, v0.h[6] 687 umlal2 v15.4s, v12.8h, v0.h[6] 688 umlal v14.4s, v7.4h, v0.h[6] 689 umlal2 v15.4s, v7.8h, v0.h[6] 690 105: add x12, x9, #0x1d8 691 bic x12, x12, #0x200 692 ld1 {v12.4h}, [x12], #8 693 bic x12, x12, #0x200 694 ld1 {v13.4h}, [x12] 695 umlal v14.4s, v12.4h, v0.h[5] 696 umlal v15.4s, v13.4h, v0.h[5] 697 umlal2 v14.4s, v6.8h, v0.h[5] 698 umlal v15.4s, v7.4h, v0.h[5] 699 104: add x12, x9, #0x1e0 700 bic x12, x12, #0x200 701 ld1 {v12.8h}, [x12] 702 umlal v14.4s, v12.4h, v0.h[4] 703 umlal2 v15.4s, v12.8h, v0.h[4] 704 umlal v14.4s, v6.4h, v0.h[4] 705 umlal2 v15.4s, v6.8h, v0.h[4] 706 103: add x12, x9, #0x1e8 707 bic x12, x12, #0x200 708 ld1 {v12.4h}, [x12], #8 709 bic x12, x12, #0x200 710 ld1 {v13.4h}, [x12] 711 umlal v14.4s, v12.4h, v0.h[3] 712 umlal v15.4s, v13.4h, v0.h[3] 713 umlal2 v14.4s, v5.8h, v0.h[3] 714 umlal v15.4s, v6.4h, v0.h[3] 715 102: add x12, x9, #0x1f0 716 bic x12, x12, #0x200 717 ld1 {v12.8h}, [x12] 718 umlal v14.4s, v12.4h, v0.h[2] 719 umlal2 v15.4s, v12.8h, v0.h[2] 720 umlal v14.4s, v5.4h, v0.h[2] 721 umlal2 v15.4s, v5.8h, v0.h[2] 722 101: add x12, x9, #0x1f8 723 bic x12, x12, #0x200 724 ld1 {v12.4h}, [x12] 725 umlal v14.4s, v12.4h, v0.h[1] 726 umlal v15.4s, v4.4h, v0.h[1] 727 umlal2 v14.4s, v4.8h, v0.h[1] 728 umlal v15.4s, v5.4h, v0.h[1] 729 730 uqrshrn v14.4h, v14.4s, #16 731 uqrshrn2 v14.8h, v15.4s, #16 732 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 733 734 st1 {v4.16b}, [x9], #16 735 bic x9, x9, #0x200 736 mov v4.16b, v5.16b 737 mov v5.16b, v6.16b 738 mov v6.16b, v7.16b 739 mov v7.16b, v8.16b 740 mov v8.16b, v9.16b 741 mov v9.16b, v10.16b 742 mov v10.16b, v11.16b 743.endm/*}}}*/ 744 745.macro hconv4_25/*{{{*/ 746 add x12, x9, #0x198 747 bic x12, x12, 
#0x200 748 ld1 {v12.4h}, [x12], #8 749 bic x12, x12, #0x200 750 ld1 {v13.4h}, [x12] 751 umull v14.4s, v12.4h, v0.h[0] 752 umull v15.4s, v13.4h, v0.h[0] 753 754 adr x12, 199f-8 755 ldr x12, [x12, x5, LSL #3] 756 br x12 757 199: .xword 101f 758 .xword 102f 759 .xword 103f 760 .xword 104f 761 .xword 105f 762 .xword 106f 763 .xword 107f 764 .xword 108f 765 .xword 109f 766 .xword 110f 767 .xword 111f 768 .xword 112f 769 .xword 113f 770 .xword 114f 771 .xword 115f 772 .xword 116f 773 .xword 117f 774 .xword 118f 775 .xword 119f 776 .xword 120f 777 .xword 121f 778 .xword 122f 779 .xword 123f 780 .xword 124f 781 .xword 125f 782 .align 4 783 125: add x12, x9, #0x0d0 784 bic x12, x12, #0x200 785 ld1 {v12.8h}, [x12] 786 umlal v14.4s, v12.4h, v3.h[1] 787 umlal2 v15.4s, v12.8h, v3.h[1] 788 umlal v14.4s, v10.4h, v3.h[1] 789 umlal2 v15.4s, v10.8h, v3.h[1] 790 124: add x12, x9, #0x0d8 791 bic x12, x12, #0x200 792 ld1 {v12.4h}, [x12], #8 793 bic x12, x12, #0x200 794 ld1 {v13.4h}, [x12] 795 umlal v14.4s, v12.4h, v3.h[0] 796 umlal v15.4s, v13.4h, v3.h[0] 797 umlal2 v14.4s, v9.8h, v3.h[0] 798 umlal v15.4s, v10.4h, v3.h[0] 799 123: add x12, x9, #0x0e0 800 bic x12, x12, #0x200 801 ld1 {v12.8h}, [x12] 802 umlal v14.4s, v12.4h, v2.h[7] 803 umlal2 v15.4s, v12.8h, v2.h[7] 804 umlal v14.4s, v9.4h, v2.h[7] 805 umlal2 v15.4s, v9.8h, v2.h[7] 806 122: add x12, x9, #0x0e8 807 bic x12, x12, #0x200 808 ld1 {v12.4h}, [x12], #8 809 bic x12, x12, #0x200 810 ld1 {v13.4h}, [x12] 811 umlal v14.4s, v12.4h, v2.h[6] 812 umlal v15.4s, v13.4h, v2.h[6] 813 umlal2 v14.4s, v8.8h, v2.h[6] 814 umlal v15.4s, v9.4h, v2.h[6] 815 121: add x12, x9, #0x0f0 816 bic x12, x12, #0x200 817 ld1 {v12.8h}, [x12] 818 umlal v14.4s, v12.4h, v2.h[5] 819 umlal2 v15.4s, v12.8h, v2.h[5] 820 umlal v14.4s, v8.4h, v2.h[5] 821 umlal2 v15.4s, v8.8h, v2.h[5] 822 120: add x12, x9, #0x0f8 823 bic x12, x12, #0x200 824 ld1 {v12.4h}, [x12], #8 825 bic x12, x12, #0x200 826 ld1 {v13.4h}, [x12] 827 umlal v14.4s, v12.4h, v2.h[4] 828 umlal v15.4s, 
v13.4h, v2.h[4] 829 umlal2 v14.4s, v7.8h, v2.h[4] 830 umlal v15.4s, v8.4h, v2.h[4] 831 119: add x12, x9, #0x100 832 bic x12, x12, #0x200 833 ld1 {v12.8h}, [x12] 834 umlal v14.4s, v12.4h, v2.h[3] 835 umlal2 v15.4s, v12.8h, v2.h[3] 836 umlal v14.4s, v7.4h, v2.h[3] 837 umlal2 v15.4s, v7.8h, v2.h[3] 838 118: add x12, x9, #0x108 839 bic x12, x12, #0x200 840 ld1 {v12.4h}, [x12], #8 841 bic x12, x12, #0x200 842 ld1 {v13.4h}, [x12] 843 umlal v14.4s, v12.4h, v2.h[2] 844 umlal v15.4s, v13.4h, v2.h[2] 845 umlal2 v14.4s, v6.8h, v2.h[2] 846 umlal v15.4s, v7.4h, v2.h[2] 847 117: add x12, x9, #0x110 848 bic x12, x12, #0x200 849 ld1 {v12.8h}, [x12] 850 umlal v14.4s, v12.4h, v2.h[1] 851 umlal2 v15.4s, v12.8h, v2.h[1] 852 umlal v14.4s, v6.4h, v2.h[1] 853 umlal2 v15.4s, v6.8h, v2.h[1] 854 116: add x12, x9, #0x118 855 bic x12, x12, #0x200 856 ld1 {v12.4h}, [x12], #8 857 bic x12, x12, #0x200 858 ld1 {v13.4h}, [x12] 859 umlal v14.4s, v12.4h, v2.h[0] 860 umlal v15.4s, v13.4h, v2.h[0] 861 umlal2 v14.4s, v5.8h, v2.h[0] 862 umlal v15.4s, v6.4h, v2.h[0] 863 115: add x12, x9, #0x120 864 bic x12, x12, #0x200 865 ld1 {v12.8h}, [x12] 866 umlal v14.4s, v12.4h, v1.h[7] 867 umlal2 v15.4s, v12.8h, v1.h[7] 868 umlal v14.4s, v5.4h, v1.h[7] 869 umlal2 v15.4s, v5.8h, v1.h[7] 870 114: add x12, x9, #0x128 871 bic x12, x12, #0x200 872 ld1 {v12.4h}, [x12], #8 873 bic x12, x12, #0x200 874 ld1 {v13.4h}, [x12] 875 umlal v14.4s, v12.4h, v1.h[6] 876 umlal v15.4s, v13.4h, v1.h[6] 877 umlal2 v14.4s, v4.8h, v1.h[6] 878 umlal v15.4s, v5.4h, v1.h[6] 879 113: add x12, x9, #0x130 880 bic x12, x12, #0x200 881 ld1 {v12.8h}, [x12] 882 umlal v14.4s, v12.4h, v1.h[5] 883 umlal2 v15.4s, v12.8h, v1.h[5] 884 umlal v14.4s, v4.4h, v1.h[5] 885 umlal2 v15.4s, v4.8h, v1.h[5] 886 112: add x12, x9, #0x138 887 bic x12, x12, #0x200 888 ld1 {v12.4h}, [x12], #8 889 bic x12, x12, #0x200 890 ld1 {v16.4h}, [x12] 891 add x12, x9, #0x1f8 892 bic x12, x12, #0x200 893 ld1 {v13.4h}, [x12] 894 umlal v14.4s, v12.4h, v1.h[4] 895 umlal v15.4s, 
v16.4h, v1.h[4] 896 umlal v14.4s, v13.4h, v1.h[4] // Could be d7, without the load, right? 897 umlal v15.4s, v4.4h, v1.h[4] 898 111: add x12, x9, #0x140 899 bic x12, x12, #0x200 900 ld1 {v12.8h}, [x12] 901 add x12, x9, #0x1f0 902 bic x12, x12, #0x200 903 ld1 {v13.8h}, [x12] 904 umlal v14.4s, v12.4h, v1.h[3] 905 umlal2 v15.4s, v12.8h, v1.h[3] 906 umlal v14.4s, v13.4h, v1.h[3] 907 umlal2 v15.4s, v13.8h, v1.h[3] 908 110: add x12, x9, #0x148 909 bic x12, x12, #0x200 910 ld1 {v12.4h}, [x12], #8 911 bic x12, x12, #0x200 912 ld1 {v16.4h}, [x12] 913 add x12, x9, #0x1e8 914 bic x12, x12, #0x200 915 ld1 {v13.4h}, [x12], #8 916 bic x12, x12, #0x200 917 ld1 {v17.4h}, [x12] 918 umlal v14.4s, v12.4h, v1.h[2] 919 umlal v15.4s, v16.4h, v1.h[2] 920 umlal v14.4s, v13.4h, v1.h[2] 921 umlal v15.4s, v17.4h, v1.h[2] 922 109: add x12, x9, #0x150 923 bic x12, x12, #0x200 924 ld1 {v12.8h}, [x12] 925 add x12, x9, #0x1e0 926 bic x12, x12, #0x200 927 ld1 {v13.8h}, [x12] 928 umlal v14.4s, v12.4h, v1.h[1] 929 umlal2 v15.4s, v12.8h, v1.h[1] 930 umlal v14.4s, v13.4h, v1.h[1] 931 umlal2 v15.4s, v13.8h, v1.h[1] 932 108: add x12, x9, #0x158 933 bic x12, x12, #0x200 934 ld1 {v12.4h}, [x12], #8 935 bic x12, x12, #0x200 936 ld1 {v16.4h}, [x12] 937 add x12, x9, #0x1d8 938 bic x12, x12, #0x200 939 ld1 {v13.4h}, [x12], #8 940 bic x12, x12, #0x200 941 ld1 {v17.4h}, [x12] 942 umlal v14.4s, v12.4h, v1.h[0] 943 umlal v15.4s, v16.4h, v1.h[0] 944 umlal v14.4s, v13.4h, v1.h[0] 945 umlal v15.4s, v17.4h, v1.h[0] 946 107: add x12, x9, #0x160 947 bic x12, x12, #0x200 948 ld1 {v12.8h}, [x12] 949 add x12, x9, #0x1d0 950 bic x12, x12, #0x200 951 ld1 {v13.8h}, [x12] 952 umlal v14.4s, v12.4h, v0.h[7] 953 umlal2 v15.4s, v12.8h, v0.h[7] 954 umlal v14.4s, v13.4h, v0.h[7] 955 umlal2 v15.4s, v13.8h, v0.h[7] 956 106: add x12, x9, #0x168 957 bic x12, x12, #0x200 958 ld1 {v12.4h}, [x12], #8 959 bic x12, x12, #0x200 960 ld1 {v16.4h}, [x12] 961 add x12, x9, #0x1c8 962 bic x12, x12, #0x200 963 ld1 {v13.4h}, [x12], #8 964 bic x12, 
x12, #0x200 965 ld1 {v17.4h}, [x12] 966 umlal v14.4s, v12.4h, v0.h[6] 967 umlal v15.4s, v16.4h, v0.h[6] 968 umlal v14.4s, v13.4h, v0.h[6] 969 umlal v15.4s, v17.4h, v0.h[6] 970 105: add x12, x9, #0x170 971 bic x12, x12, #0x200 972 ld1 {v12.8h}, [x12] 973 add x12, x9, #0x1c0 974 bic x12, x12, #0x200 975 ld1 {v13.8h}, [x12] 976 umlal v14.4s, v12.4h, v0.h[5] 977 umlal2 v15.4s, v12.8h, v0.h[5] 978 umlal v14.4s, v13.4h, v0.h[5] 979 umlal2 v15.4s, v13.8h, v0.h[5] 980 104: add x12, x9, #0x178 981 bic x12, x12, #0x200 982 ld1 {v12.4h}, [x12], #8 983 bic x12, x12, #0x200 984 ld1 {v16.4h}, [x12] 985 add x12, x9, #0x1b8 986 bic x12, x12, #0x200 987 ld1 {v13.4h}, [x12], #8 988 bic x12, x12, #0x200 989 ld1 {v17.4h}, [x12] 990 umlal v14.4s, v12.4h, v0.h[4] 991 umlal v15.4s, v16.4h, v0.h[4] 992 umlal v14.4s, v13.4h, v0.h[4] 993 umlal v15.4s, v17.4h, v0.h[4] 994 103: add x12, x9, #0x180 995 bic x12, x12, #0x200 996 ld1 {v12.8h}, [x12] 997 add x12, x9, #0x1b0 998 bic x12, x12, #0x200 999 ld1 {v13.8h}, [x12] 1000 umlal v14.4s, v12.4h, v0.h[3] 1001 umlal2 v15.4s, v12.8h, v0.h[3] 1002 umlal v14.4s, v13.4h, v0.h[3] 1003 umlal2 v15.4s, v13.8h, v0.h[3] 1004 102: add x12, x9, #0x188 1005 bic x12, x12, #0x200 1006 ld1 {v12.4h}, [x12], #8 1007 bic x12, x12, #0x200 1008 ld1 {v16.4h}, [x12] 1009 add x12, x9, #0x1a8 1010 bic x12, x12, #0x200 1011 ld1 {v13.4h}, [x12], #8 1012 bic x12, x12, #0x200 1013 ld1 {v17.4h}, [x12] 1014 umlal v14.4s, v12.4h, v0.h[2] 1015 umlal v15.4s, v16.4h, v0.h[2] 1016 umlal v14.4s, v13.4h, v0.h[2] 1017 umlal v15.4s, v17.4h, v0.h[2] 1018 101: add x12, x9, #0x190 1019 bic x12, x12, #0x200 1020 ld1 {v12.8h}, [x12], #16 1021 bic x12, x12, #0x200 1022 ld1 {v13.8h}, [x12] 1023 umlal v14.4s, v12.4h, v0.h[1] 1024 umlal2 v15.4s, v12.8h, v0.h[1] 1025 umlal v14.4s, v13.4h, v0.h[1] 1026 umlal2 v15.4s, v13.8h, v0.h[1] 1027 1028 uqrshrn v14.4h, v14.4s, #16 1029 uqrshrn2 v14.8h, v15.4s, #16 1030 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 1031 1032 st1 {v4.16b}, [x9], #16 1033 bic x9, x9, 
#0x200
            /* Tail of the horizontal-convolution macro begun above: after the
             * output store and ring-buffer pointer wrap, slide the register
             * window of source data down one slot for the next iteration.
             */
            mov             v4.16b, v5.16b
            mov             v5.16b, v6.16b
            mov             v6.16b, v7.16b
            mov             v7.16b, v8.16b
            mov             v8.16b, v9.16b
            mov             v9.16b, v10.16b
            mov             v10.16b, v11.16b
.endm/*}}}*/

/* Dedicated function wrapper for the fetch macro, for the cases where
 * performance isn't that important, to keep code size down.
 * x10/x11 (used by `fetch` as row pointers) are preserved across the call.
 */
ENTRY(fetch_generic_asm)
            stp             x10, x11, [sp, #-16]!   // fetch clobbers x10/x11
            fetch
            ldp             x10, x11, [sp], #16
            ret
END(fetch_generic_asm)


/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value
 * across to fill the rest of the register pair.  Used for filling the right
 * hand edge of the window when starting too close to the right hand edge of
 * the image.
 * On entry x11 holds the (negative) index; it is negated so its magnitude
 * bits can be tested one power of two at a time.  x15/x16 are made relative
 * to x1 so that the src rewinds below adjust them by the same amount.
 */
ENTRY(prefetch_clamp1)
            sub             x11, xzr, x11           // x11 = -x11: test magnitude bits
            sub             x15, x15, x1            // rebase row pointers on x1 so the
            sub             x16, x16, x1            // rewinds below shift them too
            tbz             x11, #3, 1f             // 8 or more columns short?
            mov             v11.16b, v10.16b
            sub             x1, x1, #16
1:          mov             v12.16b, v11.16b
            movi            v13.8b, #0xff           // v13 accumulates the keep-mask
            tbz             x11, #2, 1f             // 4 or more columns short?
            ext             v12.16b, v12.16b, v12.16b, #4*2
            sub             x1, x1, #8
            shl             v13.2d, v13.2d, #32
1:          tbz             x11, #1, 1f             // 2 or more columns short?
            ext             v12.16b, v12.16b, v12.16b, #6*2
            sub             x1, x1, #4
            shl             v13.2d, v13.2d, #16
1:          tbz             x11, #0, 1f             // 1 column short?
            ext             v12.16b, v12.16b, v12.16b, #7*2
            sub             x1, x1, #2
            shl             v13.2d, v13.2d, #8
1:          dup             v12.8h, v12.h[6]        // replicate the edge column
            sxtl            v13.8h, v13.8b          // widen keep-mask to halfwords
            bif             v11.16b, v12.16b, v13.16b // pad value into dropped lanes
1:          tbz             x11, #3, 1f
            mov             v10.16b, v11.16b
            mov             v11.16b, v12.16b
1:          sub             x11, xzr, x11           // restore original x11
            add             x15, x15, x1            // restore absolute row pointers
            add             x16, x16, x1
            ret
END(prefetch_clamp1)

/* uchar4 variant of prefetch_clamp1: lane granularity is a whole uchar4x2
 * doubleword, so only the two coarse steps are needed.
 */
ENTRY(prefetch_clamp4)
            sub             x11, xzr, x11
            sub             x15, x15, x1
            sub             x16, x16, x1
            tbz             x11, #3, 1f
            sub             x1, x1, #16             // what's this?  NOTE(review): looks
                                                    // like the src rewind mirroring
                                                    // prefetch_clamp1 above -- confirm
            mov             v11.16b, v10.16b
1:          dup             v12.2d, v11.d[1]        // pad value = last whole pair
            tbz             x11, #2, 1f
            dup             v12.2d, v11.d[0]
            sub             x1, x1, #8
            dup             v11.2d, v11.d[0]
1:          tbz             x11, #3, 1f
            mov             v10.16b, v11.16b
            mov             v11.16b, v12.16b
1:          sub             x11, xzr, x11
            add             x15, x15, x1
            add             x16, x16, x1
            ret
END(prefetch_clamp4)


/* Helpers for prefetch, below.
 *
 * prefetch_out delivers a filled register pair to its destination:
 *   store=1  -- append both to the ring buffer at x9
 *   store=0  -- move into the named register pair \qa,\qb
 *   store<0  -- insert only the high half \qsb_hi into \qb
 */
.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
  .if \store > 0
    .ifc \qsa,\qsb
            st1             {\qsa}, [x9], #16
            st1             {\qsb}, [x9], #16
    .else
            st1             {\qsa,\qsb}, [x9], #32
    .endif
  .elseif \store == 0
            mov             \qa, \qsa
            mov             \qb, \qsb
  .else
            ins             \qb, \qsb_hi
  .endif
.endm

/* Fill one 16-byte slot of the window, \rem bytes from the end.  Slots before
 * the fill start (x10) get left-edge padding from v9; slots before the fill
 * stop (x11) get fetched data from q10/q11; beyond that the right edge is
 * clamped via prefetch_clamp\step.
 */
.macro prefetch_one  qa, qb, rem, c, store=0, step=1
.set i, (need - 16) - \rem
.if i >= 0
1:          cmp             x10, #i+16
            blo             2f
            prefetch_out    \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]  // still in left padding
            b               1f
2:          cmp             x11, #i+16
            bls             3f
            prefetch_out    \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
            bl              fetch_generic_asm       // refill q10/q11 for the next slot
            b               2f
3:          bl              prefetch_clamp\step     // input exhausted: clamp right edge
            prefetch_out    \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
4:          b               4f+4                    // NOTE(review): lands 4 bytes past the
                                                    // next `4:` label (next expansion) --
                                                    // relies on fixed 4-byte encodings
            //v12 contains pad word from prefetch_clamp call
            prefetch_out    \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
  .if \rem > 0
            b               4f+4
  .else
1:
2:
3:
4:          nop
  .endif
.endif
.endm

/* Fill the convolution window with context data.  The aim here is to load
 * exactly rlf + rrt columns, and in the main loop to read as many columns as
 * will be written.  This is complicated by the need to handle cases when the
 * input starts very close to the left or right (or both) edges of the image,
 * and where these do not fall on 16-byte boundaries.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x3 -- count
 *      x4 -- inlen
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x8 -- rlf
 *      x9 -- buffer (if needed)
 *      x13 = -pitch
 *      x15 = top-row in
 *      x16 = bottom-row in
 * Output:
 *      x1 += rlf + min(count, rrt)
 * Modifies:
 *      x10 -- fill start index in the window
 *      x11 -- fill stop index in the window
 *      x12 -- scratch
 */
.macro prefetch step=1, max_r=25
.set need, ((\max_r + \max_r) * \step + 15) & ~15   // window bytes, rounded up to 16
  .if \step == 1
            mov             x10, #need - (\max_r * \step)
            sub             x10, x10, x8            // fill start = need - max_r - rlf
  .else
            mov             x10, #need - (\max_r * \step)
            sub             x10, x10, x8, LSL #2    // rlf is in pixels; scale to bytes
  .endif
            add             x11, x10, x4            // fill stop = start + inlen,
            subs            x11, x11, #need         // saturated at the window size
            csel            x11, xzr, x11, hi
            add             x11, x11, #need

            bl              fetch_generic_asm       // first 16 columns -> q10,q11
  .if \step == 1
            dup             v9.8h, v10.h[0]         // v9 = left-edge padding value
  .else
            dup             v9.2d, v10.d[0]
  .endif
            tst             x10, #15                // fill start 16-byte aligned?
            beq             2f
            sub             x12, xzr, x10
            tbz             x10, #3, 1f             // shift padding in, one power of
            mov             v11.16b, v10.16b        // two at a time, until aligned
            mov             v10.16b, v9.16b
1:          tbz             x12, #2, 1f
            ext             v11.16b, v10.16b, v11.16b, #4*2
            ext             v10.16b, v9.16b, v10.16b, #4*2
  .if \step == 1
1:          tbz             x12, #1, 1f
            ext             v11.16b, v10.16b, v11.16b, #2*2
            ext             v10.16b, v9.16b, v10.16b, #2*2
1:          tbz             x12, #0, 1f
            ext             v11.16b, v10.16b, v11.16b, #1*2
            ext             v10.16b, v9.16b, v10.16b, #1*2
  .endif
1:          sub             x1, x1, x10             // round the source pointers down
            sub             x15, x15, x10           // to the enclosing 16-byte
            sub             x16, x16, x10           // boundary
            bic             x10, x10, #15
            add             x1, x1, x10
            add             x15, x15, x10
            add             x16, x16, x10
2:
  .if \step > 1
            /* it's only in the uchar2 and uchar4 cases where the register file
             * is insufficient (given MAX_R <= 25).
             */
            prefetch_one    xx, xx, 192, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx, 176, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx, 160, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx, 144, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx, 128, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx, 112, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx,  96, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx,  80, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx,  64, c=\max_r, step=\step, store=1
            prefetch_one    xx, xx,  48, c=\max_r, step=\step, store=1
  .else
            /* q3 normally contains the coefficient table, but it's not fully
             * used.  In the uchar1, r=25 case the other half of q3 is used for
             * the last two window taps to avoid falling out to memory.
             */
            prefetch_one    xx, v3.d[1], 48, c=\max_r, step=\step, store=-1
  .endif
            prefetch_one    v4.16b, v5.16b, 32, c=\max_r, step=\step, store=0
            prefetch_one    v6.16b, v7.16b, 16, c=\max_r, step=\step, store=0
            prefetch_one    v8.16b, v9.16b,  0, c=\max_r, step=\step, store=0

  .if \step == 1
            add             x10, x8, #\max_r * \step    // columns consumed so far
  .else
            lsl             x10, x8, #2
            add             x10, x10, #\max_r * \step
  .endif
            subs            x4, x4, x10             // deduct them from inlen,
            csel            x4, xzr, x4, lo         // clamping at zero
.endm

/* The main loop.
 *
 * Input:
 *      x0 = dst
 *      x1 = src
 *      x2 = pitch
 *      x3 = count
 *      x4 = inlen
 *      x5 = r
 *      x6 = rup
 *      x7 = rdn
 *      x9 = buffer
 *      x13 = -pitch
 *      x15 = top-row in
 *      x16 = bottom-row in
 * Modifies
 *      x8 = fetch code pointer
 */
.macro mainloop core, step=1, max_r=25, labelc="", labelnc=""
            adrp            x8, \labelnc
            add             x8, x8, #:lo12:\labelnc
            sub             x8, x8, x5, LSL #5      // non-clamped fetch body is 40 bytes
            sub             x8, x8, x5, LSL #3      // (10 insns) per tap: labelnc - r*40
            cmp             x5, x6
            ccmp            x5, x7, #0, eq
            beq             5f

            /* if (r != rup || r != rdn) then the address-clamping table should
             * be used rather than the short-cut version.
             */
            adrp            x8, \labelc
            add             x8, x8, #:lo12:\labelc
            sub             x8, x8, x5, LSL #6      // clamped fetch body is 56 bytes
            add             x8, x8, x5, LSL #3      // (14 insns) per tap: labelc - r*56
            b               5f
            .align 4
3:          fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8

            /* For each call to fetch two are made to \core.  It would be
             * preferable to have twice the work done in \core.
             */
            \core
            st1             {v15.8b}, [x0], #8
            \core
            st1             {v15.8b}, [x0], #8

            sub             x3, x3, #16
5:          subs            x4, x4, #16             // main loop: 16 input columns
            bhs             3b                      // per iteration
            adds            x4, x4, #16
            bne             1f
  .if \step==1
            dup             v10.8h, v9.h[7]         // no input left at all: the whole
            dup             v11.8h, v9.h[7]         // last block is edge padding
  .else
            dup             v10.2d, v9.d[1]
            dup             v11.2d, v9.d[1]
  .endif
            b               4f

1:          sub             x1, x1, #16             // partial block: back up so the
            sub             x15, x15, #16           // final fetch ends exactly at the
            sub             x16, x16, #16           // end of the input
            add             x1, x1, x4
            add             x15, x15, x4
            add             x16, x16, x4
            bl              fetch_generic_asm

  .if \step==1
            dup             v12.8h, v11.h[7]        // pad value = last valid column
  .else
            dup             v12.2d, v11.d[1]
  .endif
            sub             x4, xzr, x4
            tbz             x4, #3, 1f              // shift the over-read columns out,
            mov             v10.16b, v11.16b        // one power of two at a time
            mov             v11.16b, v12.16b
1:          tbz             x4, #2, 1f
            ext             v10.16b, v10.16b, v11.16b, #4*2
            ext             v11.16b, v11.16b, v12.16b, #4*2
1:          tbz             x4, #1, 1f
            ext             v10.16b, v10.16b, v11.16b, #2*2
            ext             v11.16b, v11.16b, v12.16b, #2*2
1:          tbz             x4, #0, 4f
            ext             v10.16b, v10.16b, v11.16b, #1*2
            ext             v11.16b, v11.16b, v12.16b, #1*2
4:          cbz             x3, 5f
3:          \core                                   // drain: convolve remaining output
  .if \step==1
            dup             v11.8h, v11.h[7]
  .else
            dup             v11.2d, v11.d[1]
  .endif
            subs            x3, x3, #8
            blo             4f
            st1             {v15.8b}, [x0], #8
            beq             5f
            b               3b
4:          tbz             x3, #2, 1f              // store the final 1..7 bytes of
            st1             {v15.s}[0], [x0], #4    // output piecewise
            ext             v15.16b, v15.16b, v15.16b, #4*2
1:          tbz             x3, #1, 1f
            st1             {v15.h}[0], [x0], #2
            ext             v15.16b, v15.16b, v15.16b, #2*2
1:          tbz             x3, #0, 5f
            st1             {v15.b}[0], [x0], #1
            ext             v15.16b, v15.16b, v15.16b, #1*2
5:          nop
.endm

/* Generate one convolve1_<r> entry point for each tuned radius, plus the
 * generic r=25 fallback.
 */
.irep r, TUNED_LIST1, 25
ENTRY(convolve1_\r)
            stp             x29,x30, [sp, #-16]!

            prefetch        step=1, max_r=\r

            mainloop        core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r

            ldp             x29,x30, [sp], #16
            ret
END(convolve1_\r)
.endr

/* Generate one convolve4_<r> entry point for each tuned radius, plus the
 * generic r=25 fallback.  These carve an aligned scratch buffer out of the
 * stack for the window ring buffer.
 */
.irep r, TUNED_LIST4, 25
ENTRY(convolve4_\r)
            sub             x12, sp, #0x200         // x12 = caller sp - 0x200
            bic             x9, x12, #0x3fc         // NOTE(review): clears bits 2-9;
            mov             sp, x9                  // bits 0-1 presumed already clear
            stp             x12,x30, [sp, #-16]!    // from sp alignment -- confirm

            /* x9 now points to a buffer on the stack whose address has the low
             * 10 bits clear.  This allows easy address calculation in the
             * wrap-around cases.
             */


            prefetch        step=4, max_r=\r

            mainloop        core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r

            ldp             x12,x30, [sp]           // x12 = saved (caller sp - 0x200)
            add             sp, x12, #0x200         // so this restores the caller sp
            ret
END(convolve4_\r)
.endr

/* void rsdIntrinsicBlurU1_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 */
ENTRY(rsdIntrinsicBlurU1_K)
            stp             x16,x30, [sp, #-80]!
1425 stp x14,x15, [sp, #16] 1426 stp x12,x13, [sp, #32] 1427 stp x10,x11, [sp, #48] 1428 stp x8,x9, [sp, #64] 1429 sub x8, sp, #32 1430 sub sp, sp, #64 1431 st1 {v8.1d - v11.1d}, [sp] 1432 st1 {v12.1d - v15.1d}, [x8] 1433 mov x8, x5 // x 1434 ldr w5, [sp,#144] // r 1435 sub x9, x2, x8 1436 sub x10, x3, x6 1437 mov x2, x4 // pitch 1438 mov x3, x7 // count 1439 sub x7, x10, #1 1440 sub x9, x9, x3 1441 1442 ldr x12, [sp, #152] // tab 1443 1444 add x0, x0, x8 1445 add x1, x1, x8 1446 1447 cmp x6, x5 1448 csel x6, x5, x6, hs 1449 cmp x7, x5 1450 csel x7, x5, x7, hs 1451 cmp x8, x5 1452 csel x8, x5, x8, hs 1453 cmp x9, x5 1454 csel x9, x5, x8, hs 1455 1456 add x4, x8, x9 1457 add x4, x4, x3 1458 1459 sub x1, x1, x8 1460 1461 sub x13, xzr, x2 1462 msub x15, x2, x6, x1 1463 madd x16, x2, x7, x1 1464 1465 ld1 {v0.8h,v1.8h}, [x12], #32 1466 ld1 {v2.8h,v3.8h}, [x12], #32 1467 1468 adr x30, 1f 1469 .irep r, TUNED_LIST1 1470 cmp x5, #\r 1471 bls convolve1_\r 1472 .endr 1473 b convolve1_25 1474 14751: ld1 {v8.1d - v11.1d}, [sp], #32 1476 ld1 {v12.1d - v15.1d}, [sp], #32 1477 ldp x8,x9, [sp, #64] 1478 ldp x10,x11, [sp, #48] 1479 ldp x12,x13, [sp, #32] 1480 ldp x14,x15, [sp, #16] 1481 ldp x12,x30, [sp], #80 1482 ret 1483END(rsdIntrinsicBlurU1_K) 1484 1485/* void rsdIntrinsicBlurU4_K( 1486 * void *out, // x0 1487 * void *in, // x1 1488 * size_t w, // x2 1489 * size_t h, // x3 1490 * size_t p, // x4 1491 * size_t x, // x5 1492 * size_t y, // x6 1493 * size_t count, // x7 1494 * size_t r, // [sp] 1495 * uint16_t *tab); // [sp,#8] 1496 */ 1497ENTRY(rsdIntrinsicBlurU4_K) 1498 stp x16,x30, [sp, #-80]! 
            /* Prologue continues: save scratch pairs and the callee-saved low
             * halves of v8-v15, then translate the C arguments into the
             * register convention used by prefetch/mainloop (see above).
             * Identical to the U1 entry point except that x/count/offsets are
             * scaled by 4 for uchar4 pixels.
             */
            stp             x14,x15, [sp, #16]
            stp             x12,x13, [sp, #32]
            stp             x10,x11, [sp, #48]
            stp             x8,x9, [sp, #64]
            sub             x8, sp, #32
            sub             sp, sp, #64
            st1             {v8.1d - v11.1d}, [sp]  // callee-saved d8-d15
            st1             {v12.1d - v15.1d}, [x8]
            mov             x8, x5          // x8 = x (distance to left edge)
            ldr             w5, [sp,#144]   // x5 = r (9th arg; 80+64 bytes pushed)
            sub             x9, x2, x8      // x9 = w - x
            sub             x10, x3, x6     // x10 = h - y
            mov             x2, x4          // x2 = pitch
            mov             x3, x7          // x3 = count
            sub             x7, x10, #1     // x7 = rdn = h - y - 1
            sub             x9, x9, x3      // x9 = rrt = w - x - count

            ldr             x12, [sp, #152] // x12 = tab

            add             x0, x0, x8, LSL #2      // dst += x * 4 (uchar4)
            add             x1, x1, x8, LSL #2      // src += x * 4

            /* Clamp the four edge distances to at most the radius. */
            cmp             x6, x5
            csel            x6, x5, x6, hs  // rup = min(y, r)
            cmp             x7, x5
            csel            x7, x5, x7, hs  // rdn = min(h - y - 1, r)
            cmp             x8, x5
            csel            x8, x5, x8, hs  // rlf = min(x, r)
            cmp             x9, x5
            csel            x9, x5, x9, hs  // rrt = min(w - x - count, r)

            lsl             x3, x3, #2      // count in bytes
            add             x4, x8, x9      // inlen = (rlf + rrt) * 4 + count * 4
            add             x4, x3, x4, LSL #2

            sub             x1, x1, x8, LSL #2      // src -= rlf * 4

            sub             x13, xzr, x2    // x13 = -pitch
            msub            x15, x2, x6, x1 // x15 = top-row in (src - rup*pitch)
            madd            x16, x2, x7, x1 // x16 = bottom-row in (src + rdn*pitch)

            ld1             {v0.8h,v1.8h}, [x12], #32   // coefficient table -> q0-q3
            ld1             {v2.8h,v3.8h}, [x12], #32

            adr             x30, 1f         // return address for the tail call
  .irep r, TUNED_LIST4
            cmp             x5, #\r
            bls             convolve4_\r    // smallest tuned kernel that fits r
  .endr
            b               convolve4_25

1:          ld1             {v8.1d - v11.1d}, [sp], #32     // restore d8-d15
            ld1             {v12.1d - v15.1d}, [sp], #32
            ldp             x8,x9, [sp, #64]
            ldp             x10,x11, [sp, #48]
            ldp             x12,x13, [sp, #32]
            ldp             x14,x15, [sp, #16]
            ldp             x12,x30, [sp], #80
            ret
END(rsdIntrinsicBlurU4_K)